core/stdarch/crates/core_arch/src/x86/avx.rs

//! Advanced Vector Extensions (AVX)
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//!   System Instructions][amd64_ref].
//!
//! [Wikipedia][wiki] provides a quick overview of the instructions available.
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions

use crate::{
    core_arch::{simd::*, x86::*},
    intrinsics::simd::*,
    mem, ptr,
};

#[cfg(test)]
use stdarch_test::assert_instr;

/// Adds packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { simd_add(a, b) }
}

/// Adds packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { simd_add(a, b) }
}

/// Computes the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_pd)
#[inline]
#[target_feature(enable = "avx")]
// See https://github.com/rust-lang/stdarch/issues/71
#[cfg_attr(test, assert_instr(vandp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_and(a, b))
    }
}

/// Computes the bitwise AND of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_and(a, b))
    }
}

/// Computes the bitwise OR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_pd)
#[inline]
#[target_feature(enable = "avx")]
// See <https://github.com/rust-lang/stdarch/issues/71>.
#[cfg_attr(test, assert_instr(vorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_or(a, b))
    }
}

/// Computes the bitwise OR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_or(a, b))
    }
}

/// Shuffles double-precision (64-bit) floating-point elements within 128-bit
/// lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(MASK, 8);
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b1,
                ((MASK as u32 >> 1) & 0b1) + 4,
                ((MASK as u32 >> 2) & 0b1) + 2,
                ((MASK as u32 >> 3) & 0b1) + 6,
            ],
        )
    }
}
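
// A minimal usage sketch (hypothetical test helper, not part of the public
// API): each of the four MASK bits picks the low (0) or high (1) element of
// the corresponding 128-bit lane, alternating between `a` and `b`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
fn shuffle_pd_mask_sketch() {
    let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    let b = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
    // MASK = 0b0101: dst = [a[1], b[0], a[3], b[2]]
    let r = _mm256_shuffle_pd::<0b0101>(a, b);
    let mut out = [0.0f64; 4];
    unsafe { _mm256_storeu_pd(out.as_mut_ptr(), r) };
    assert_eq!(out, [2.0, 5.0, 4.0, 7.0]);
}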

/// Shuffles single-precision (32-bit) floating-point elements in `a` within
/// 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(MASK, 8);
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b11,
                (MASK as u32 >> 2) & 0b11,
                ((MASK as u32 >> 4) & 0b11) + 8,
                ((MASK as u32 >> 6) & 0b11) + 8,
                (MASK as u32 & 0b11) + 4,
                ((MASK as u32 >> 2) & 0b11) + 4,
                ((MASK as u32 >> 4) & 0b11) + 12,
                ((MASK as u32 >> 6) & 0b11) + 12,
            ],
        )
    }
}

/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point
/// elements in `a`, and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b))
    }
}

/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point
/// elements in `a`, and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b))
    }
}

/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { vmaxpd(a, b) }
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { vmaxps(a, b) }
}

/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { vminpd(a, b) }
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { vminps(a, b) }
}

/// Multiplies packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { simd_mul(a, b) }
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { simd_mul(a, b) }
}

/// Alternatively adds and subtracts packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a = a.as_f64x4();
        let b = b.as_f64x4();
        let add = simd_add(a, b);
        let sub = simd_sub(a, b);
        simd_shuffle!(add, sub, [4, 1, 6, 3])
    }
}
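
// A minimal usage sketch (hypothetical test helper): even-indexed results are
// `a - b`, odd-indexed results are `a + b`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
fn addsub_pd_sketch() {
    let a = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
    let b = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    let r = _mm256_addsub_pd(a, b);
    let mut out = [0.0f64; 4];
    unsafe { _mm256_storeu_pd(out.as_mut_ptr(), r) };
    assert_eq!(out, [9.0, 22.0, 27.0, 44.0]);
}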

/// Alternatively adds and subtracts packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a = a.as_f32x8();
        let b = b.as_f32x8();
        let add = simd_add(a, b);
        let sub = simd_sub(a, b);
        simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7])
    }
}

/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { simd_sub(a, b) }
}

/// Subtracts packed single-precision (32-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { simd_sub(a, b) }
}

/// Computes the division of each of the 8 packed 32-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 {
    unsafe { simd_div(a, b) }
}

/// Computes the division of each of the 4 packed 64-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe { simd_div(a, b) }
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd, ROUNDING = 0x3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_pd<const ROUNDING: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundpd256(a, ROUNDING) }
}
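
// A minimal usage sketch (hypothetical test helper): the low two bits of
// `ROUNDING` select the rounding mode listed above.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
fn round_pd_sketch() {
    let a = _mm256_setr_pd(1.5, -1.5, 2.7, -2.7);
    let mut out = [0.0f64; 4];
    // 0x01: round down (toward negative infinity).
    unsafe { _mm256_storeu_pd(out.as_mut_ptr(), _mm256_round_pd::<0x01>(a)) };
    assert_eq!(out, [1.0, -2.0, 2.0, -3.0]);
    // 0x03: truncate (round toward zero).
    unsafe { _mm256_storeu_pd(out.as_mut_ptr(), _mm256_round_pd::<0x03>(a)) };
    assert_eq!(out, [1.0, -1.0, 2.0, -2.0]);
}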

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_ceil_pd(a: __m256d) -> __m256d {
    unsafe { simd_ceil(a) }
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_floor_pd(a: __m256d) -> __m256d {
    unsafe { simd_floor(a) }
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps, ROUNDING = 0x00))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_round_ps<const ROUNDING: i32>(a: __m256) -> __m256 {
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundps256(a, ROUNDING) }
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_ceil_ps(a: __m256) -> __m256 {
    unsafe { simd_ceil(a) }
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_floor_ps(a: __m256) -> __m256 {
    unsafe { simd_floor(a) }
}

/// Returns the square root of packed single-precision (32-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sqrt_ps(a: __m256) -> __m256 {
    unsafe { simd_fsqrt(a) }
}

/// Returns the square root of packed double-precision (64-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
    unsafe { simd_fsqrt(a) }
}

/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_pd)
#[inline]
#[target_feature(enable = "avx")]
// Note: LLVM7 prefers single-precision blend instructions when
// possible, see: https://bugs.llvm.org/show_bug.cgi?id=38194
// #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                ((IMM4 as u32 >> 0) & 1) * 4 + 0,
                ((IMM4 as u32 >> 1) & 1) * 4 + 1,
                ((IMM4 as u32 >> 2) & 1) * 4 + 2,
                ((IMM4 as u32 >> 3) & 1) * 4 + 3,
            ],
        )
    }
}
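
// A minimal usage sketch (hypothetical test helper): bit `i` of the mask
// selects element `i` from `b` (bit set) or from `a` (bit clear).
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
fn blend_pd_sketch() {
    let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    let b = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
    // 0b1010: take a[0], b[1], a[2], b[3].
    let r = _mm256_blend_pd::<0b1010>(a, b);
    let mut out = [0.0f64; 4];
    unsafe { _mm256_storeu_pd(out.as_mut_ptr(), r) };
    assert_eq!(out, [1.0, 6.0, 3.0, 8.0]);
}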

/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                ((IMM8 as u32 >> 0) & 1) * 8 + 0,
                ((IMM8 as u32 >> 1) & 1) * 8 + 1,
                ((IMM8 as u32 >> 2) & 1) * 8 + 2,
                ((IMM8 as u32 >> 3) & 1) * 8 + 3,
                ((IMM8 as u32 >> 4) & 1) * 8 + 4,
                ((IMM8 as u32 >> 5) & 1) * 8 + 5,
                ((IMM8 as u32 >> 6) & 1) * 8 + 6,
                ((IMM8 as u32 >> 7) & 1) * 8 + 7,
            ],
        )
    }
}

/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    unsafe {
        let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO);
        transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4()))
    }
}

/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    unsafe {
        let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO);
        transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8()))
    }
}
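
// A minimal usage sketch (hypothetical test helper): only the sign bit of each
// mask element matters; a set sign bit selects the element from `b`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
fn blendv_ps_sketch() {
    let a = _mm256_set1_ps(1.0);
    let b = _mm256_set1_ps(2.0);
    let mask = _mm256_setr_ps(-0.0, 0.0, -1.0, 1.0, -0.0, 0.0, -1.0, 1.0);
    let r = _mm256_blendv_ps(a, b, mask);
    let mut out = [0.0f32; 8];
    unsafe { _mm256_storeu_ps(out.as_mut_ptr(), r) };
    assert_eq!(out, [2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0]);
}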

/// Conditionally multiplies the packed single-precision (32-bit) floating-point
/// elements in `a` and `b` using the high 4 bits in `imm8`, sums the four
/// products, and conditionally returns the sum using the low 4 bits of `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdpps, IMM8 = 0x0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { vdpps(a, b, IMM8 as i8) }
}
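
// A minimal usage sketch (hypothetical test helper, assuming the vdpps
// behavior described above): the high nibble of IMM8 chooses which products
// enter the per-lane sum, the low nibble chooses which result elements receive
// that sum (the rest are zeroed). Each 128-bit lane is processed independently.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
fn dp_ps_sketch() {
    let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 1.0, 1.0, 1.0, 1.0);
    let b = _mm256_setr_ps(1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0);
    let r = _mm256_dp_ps::<0xF1>(a, b);
    let mut out = [0.0f32; 8];
    unsafe { _mm256_storeu_ps(out.as_mut_ptr(), r) };
    assert_eq!(out, [10.0, 0.0, 0.0, 0.0, 8.0, 0.0, 0.0, 0.0]);
}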

/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in even locations,
/// while sums of elements from `b` are returned in odd locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
        let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
        simd_add(even, odd)
    }
}

/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in locations
/// 0, 1, 4, 5, while sums of elements from `b` are returned in locations
/// 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_add(even, odd)
    }
}
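
// A minimal usage sketch (hypothetical test helper): pair sums from `a` land
// in result positions 0, 1, 4, 5 and pair sums from `b` in positions 2, 3, 6, 7.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
fn hadd_ps_sketch() {
    let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
    let b = _mm256_setr_ps(10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0);
    let r = _mm256_hadd_ps(a, b);
    let mut out = [0.0f32; 8];
    unsafe { _mm256_storeu_ps(out.as_mut_ptr(), r) };
    assert_eq!(out, [3.0, 7.0, 30.0, 70.0, 11.0, 15.0, 110.0, 150.0]);
}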

/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, differences of elements from `a` are returned in even
/// locations, while differences of elements from `b` are returned in odd
/// locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
        let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
        simd_sub(even, odd)
    }
}

/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, differences of elements from `a` are returned in locations
/// 0, 1, 4, 5, while differences of elements from `b` are returned in
/// locations 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_sub(even, odd)
    }
}

/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
    unsafe {
        let a: u64x4 = transmute(a);
        let b: u64x4 = transmute(b);
        transmute(simd_xor(a, b))
    }
}

/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 {
    unsafe {
        let a: u32x8 = transmute(a);
        let b: u32x8 = transmute(b);
        transmute(simd_xor(a, b))
    }
}

/// Equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OQ: i32 = 0x00;
/// Less-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OS: i32 = 0x01;
/// Less-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OS: i32 = 0x02;
/// Unordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_Q: i32 = 0x03;
/// Not-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_UQ: i32 = 0x04;
/// Not-less-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_US: i32 = 0x05;
/// Not-less-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_US: i32 = 0x06;
/// Ordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_Q: i32 = 0x07;
/// Equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_UQ: i32 = 0x08;
/// Not-greater-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_US: i32 = 0x09;
/// Not-greater-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_US: i32 = 0x0a;
/// False (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OQ: i32 = 0x0b;
/// Not-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OQ: i32 = 0x0c;
/// Greater-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OS: i32 = 0x0d;
/// Greater-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OS: i32 = 0x0e;
/// True (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_UQ: i32 = 0x0f;
/// Equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OS: i32 = 0x10;
/// Less-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OQ: i32 = 0x11;
/// Less-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OQ: i32 = 0x12;
/// Unordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_S: i32 = 0x13;
/// Not-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_US: i32 = 0x14;
/// Not-less-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_UQ: i32 = 0x15;
/// Not-less-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_UQ: i32 = 0x16;
/// Ordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_S: i32 = 0x17;
/// Equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_US: i32 = 0x18;
/// Not-greater-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_UQ: i32 = 0x19;
/// Not-greater-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_UQ: i32 = 0x1a;
/// False (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OS: i32 = 0x1b;
/// Not-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OS: i32 = 0x1c;
/// Greater-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OQ: i32 = 0x1d;
/// Greater-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OQ: i32 = 0x1e;
/// True (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_US: i32 = 0x1f;

/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmppd(a, b, const { IMM5 as i8 }) }
}

/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmppd256(a, b, IMM5 as u8) }
}
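
// A minimal usage sketch (hypothetical test helper): a matching element yields
// an all-ones bit pattern, a non-matching element yields all zeros, so the
// result is typically consumed as a mask (here via `_mm256_movemask_pd`).
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
fn cmp_pd_sketch() {
    let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    let b = _mm256_setr_pd(2.0, 2.0, 2.0, 2.0);
    let r = _mm256_cmp_pd::<_CMP_LT_OQ>(a, b);
    // Only element 0 satisfies `a < b`, so only bit 0 of the mask is set.
    assert_eq!(_mm256_movemask_pd(r), 0b0001);
}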

/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmpps(a, b, const { IMM5 as i8 }) }
}

/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmpps256(a, b, const { IMM5 as u8 }) }
}

/// Compares the lower double-precision (64-bit) floating-point element in
/// `a` and `b` based on the comparison operand specified by `IMM5`,
/// stores the result in the lower element of the returned vector,
/// and copies the upper element from `a` to the upper element of the returned
/// vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmpsd(a, b, IMM5 as i8) }
}

/// Compares the lower single-precision (32-bit) floating-point element in
/// `a` and `b` based on the comparison operand specified by `IMM5`,
/// stores the result in the lower element of the returned vector,
/// and copies the upper 3 packed elements from `a` to the upper elements of
/// the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmp_ss<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM5, 5);
    unsafe { vcmpss(a, b, IMM5 as i8) }
}

/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
    unsafe { simd_cast(a.as_i32x4()) }
}

/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
    unsafe { simd_cast(a.as_i32x8()) }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
    unsafe { simd_cast(a) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
    unsafe { transmute(vcvtps2dq(a)) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtps_pd(a: __m128) -> __m256d {
    unsafe { simd_cast(a) }
}

/// Returns the first element of the input vector of `[4 x double]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsd_f64)
#[inline]
#[target_feature(enable = "avx")]
//#[cfg_attr(test, assert_instr(movsd))] FIXME
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
    unsafe { simd_extract!(a, 0) }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
    unsafe { transmute(vcvttpd2dq(a)) }
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
    unsafe { transmute(vcvtpd2dq(a)) }
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
    unsafe { transmute(vcvttps2dq(a)) }
}

/// Extracts 128 bits (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        simd_shuffle!(
            a,
            _mm256_undefined_ps(),
            [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
        )
    }
}
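
// A minimal usage sketch (hypothetical test helper): IMM1 = 0 extracts the low
// 128-bit half, IMM1 = 1 extracts the high half.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
fn extractf128_ps_sketch() {
    let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
    let hi = _mm256_extractf128_ps::<1>(a);
    let mut out = [0.0f32; 4];
    unsafe { _mm_storeu_ps(out.as_mut_ptr(), hi) };
    assert_eq!(out, [5.0, 6.0, 7.0, 8.0]);
}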

/// Extracts 128 bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe { simd_shuffle!(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize]) }
}

/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        let dst: i64x2 = simd_shuffle!(a.as_i64x4(), i64x4::ZERO, [[0, 1], [2, 3]][IMM1 as usize],);
        transmute(dst)
    }
}

/// Extracts a 32-bit integer from `a`, selected with `INDEX`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extract_epi32<const INDEX: i32>(a: __m256i) -> i32 {
    static_assert_uimm_bits!(INDEX, 3);
    unsafe { simd_extract!(a.as_i32x8(), INDEX as u32) }
}

/// Returns the first element of the input vector of `[8 x i32]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsi256_si32)
#[inline]
#[target_feature(enable = "avx")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
    unsafe { simd_extract!(a.as_i32x8(), 0) }
}

/// Zeroes the contents of all XMM or YMM registers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroall))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zeroall() {
    unsafe { vzeroall() }
}

/// Zeroes the upper 128 bits of all YMM registers;
/// the lower 128 bits of the registers are unmodified.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroupper))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_zeroupper() {
    unsafe { vzeroupper() }
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
    unsafe { vpermilps256(a, b.as_i32x8()) }
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
    unsafe { vpermilps(a, b.as_i32x4()) }
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        simd_shuffle!(
            a,
            _mm256_undefined_ps(),
            [
                (IMM8 as u32 >> 0) & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
                ((IMM8 as u32 >> 0) & 0b11) + 4,
                ((IMM8 as u32 >> 2) & 0b11) + 4,
                ((IMM8 as u32 >> 4) & 0b11) + 4,
                ((IMM8 as u32 >> 6) & 0b11) + 4,
            ],
        )
    }
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        simd_shuffle!(
            a,
            _mm_undefined_ps(),
            [
                (IMM8 as u32 >> 0) & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        )
    }
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d {
    unsafe { vpermilpd256(a, b.as_i64x4()) }
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
    unsafe { vpermilpd(a, b.as_i64x2()) }
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, IMM4 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        simd_shuffle!(
            a,
            _mm256_undefined_pd(),
            [
                ((IMM4 as u32 >> 0) & 1),
                ((IMM4 as u32 >> 1) & 1),
                ((IMM4 as u32 >> 2) & 1) + 2,
                ((IMM4 as u32 >> 3) & 1) + 2,
            ],
        )
    }
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM2, 2);
    unsafe {
        simd_shuffle!(
            a,
            _mm_undefined_pd(),
            [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
        )
    }
}

/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x5))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    _mm256_castsi256_ps(_mm256_permute2f128_si256::<IMM8>(
        _mm256_castps_si256(a),
        _mm256_castps_si256(b),
    ))
}

/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM8, 8);
    _mm256_castsi256_pd(_mm256_permute2f128_si256::<IMM8>(
        _mm256_castpd_si256(a),
        _mm256_castpd_si256(b),
    ))
}

/// Shuffles 128-bits (composed of integer data) selected by `imm8`
/// from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    const fn idx(imm8: i32, pos: u32) -> u32 {
        let part = if pos < 2 {
            imm8 & 0xf
        } else {
            (imm8 & 0xf0) >> 4
        };
        2 * (part as u32 & 0b11) + (pos & 1)
    }
    const fn idx0(imm8: i32, pos: u32) -> u32 {
        let part = if pos < 2 {
            imm8 & 0xf
        } else {
            (imm8 & 0xf0) >> 4
        };
        if part & 0b1000 != 0 { 4 } else { pos }
    }
    unsafe {
        let r = simd_shuffle!(
            a.as_i64x4(),
            b.as_i64x4(),
            [idx(IMM8, 0), idx(IMM8, 1), idx(IMM8, 2), idx(IMM8, 3)]
        );
        let r: i64x4 = simd_shuffle!(
            r,
            i64x4::ZERO,
            [idx0(IMM8, 0), idx0(IMM8, 1), idx0(IMM8, 2), idx0(IMM8, 3)]
        );
        r.as_m256i()
    }
}
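
// A minimal usage sketch (hypothetical test helper): each nibble of IMM8
// selects a 128-bit half (0 = low of `a`, 1 = high of `a`, 2 = low of `b`,
// 3 = high of `b`); setting bit 3 of a nibble zeroes that half instead.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
fn permute2f128_si256_sketch() {
    let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
    let b = _mm256_setr_epi32(9, 10, 11, 12, 13, 14, 15, 16);
    // Low half from the high half of `a`, high half from the high half of `b`.
    let r = _mm256_permute2f128_si256::<0x31>(a, b);
    let mut out = [0i32; 8];
    unsafe { _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r) };
    assert_eq!(out, [5, 6, 7, 8, 13, 14, 15, 16]);
    // A nibble with bit 3 set (0x8) zeroes that half of the result.
    let z = _mm256_permute2f128_si256::<0x80>(a, b);
    unsafe { _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, z) };
    assert_eq!(out, [1, 2, 3, 4, 0, 0, 0, 0]);
}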
1301
1302/// Broadcasts a single-precision (32-bit) floating-point element from memory
1303/// to all elements of the returned vector.
1304///
1305/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ss)
1306#[inline]
1307#[target_feature(enable = "avx")]
1308#[cfg_attr(test, assert_instr(vbroadcastss))]
1309#[stable(feature = "simd_x86", since = "1.27.0")]
1310#[allow(clippy::trivially_copy_pass_by_ref)]
1311pub fn _mm256_broadcast_ss(f: &f32) -> __m256 {
1312    _mm256_set1_ps(*f)
1313}
1314
1315/// Broadcasts a single-precision (32-bit) floating-point element from memory
1316/// to all elements of the returned vector.
1317///
1318/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcast_ss)
1319#[inline]
1320#[target_feature(enable = "avx")]
1321#[cfg_attr(test, assert_instr(vbroadcastss))]
1322#[stable(feature = "simd_x86", since = "1.27.0")]
1323#[allow(clippy::trivially_copy_pass_by_ref)]
1324pub fn _mm_broadcast_ss(f: &f32) -> __m128 {
1325    _mm_set1_ps(*f)
1326}
1327
1328/// Broadcasts a double-precision (64-bit) floating-point element from memory
1329/// to all elements of the returned vector.
1330///
1331/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_sd)
1332#[inline]
1333#[target_feature(enable = "avx")]
1334#[cfg_attr(test, assert_instr(vbroadcastsd))]
1335#[stable(feature = "simd_x86", since = "1.27.0")]
1336#[allow(clippy::trivially_copy_pass_by_ref)]
1337pub fn _mm256_broadcast_sd(f: &f64) -> __m256d {
1338    _mm256_set1_pd(*f)
1339}
1340
1341/// Broadcasts 128 bits from memory (composed of 4 packed single-precision
1342/// (32-bit) floating-point elements) to all elements of the returned vector.
1343///
1344/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ps)
1345#[inline]
1346#[target_feature(enable = "avx")]
1347#[cfg_attr(test, assert_instr(vbroadcastf128))]
1348#[stable(feature = "simd_x86", since = "1.27.0")]
1349pub fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
1350    unsafe { simd_shuffle!(*a, _mm_setzero_ps(), [0, 1, 2, 3, 0, 1, 2, 3]) }
1351}
1352
1353/// Broadcasts 128 bits from memory (composed of 2 packed double-precision
1354/// (64-bit) floating-point elements) to all elements of the returned vector.
1355///
1356/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_pd)
1357#[inline]
1358#[target_feature(enable = "avx")]
1359#[cfg_attr(test, assert_instr(vbroadcastf128))]
1360#[stable(feature = "simd_x86", since = "1.27.0")]
1361pub fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
1362    unsafe { simd_shuffle!(*a, _mm_setzero_pd(), [0, 1, 0, 1]) }
1363}
1364
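// A minimal usage sketch for `_mm256_broadcast_pd` (illustrative only; the
// helper name below is ours, and callers must ensure AVX support at runtime):
// the whole 128-bit value is repeated into both halves of the 256-bit result.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn broadcast_pd_sketch() {
    let small = _mm_setr_pd(1.0, 2.0);
    let wide = _mm256_broadcast_pd(&small);
    let mut out = [0.0f64; 4];
    unsafe { _mm256_storeu_pd(out.as_mut_ptr(), wide) };
    assert_eq!(out, [1.0, 2.0, 1.0, 2.0]);
}
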
1365/// Copies `a` to result, then inserts 128 bits (composed of 4 packed
1366/// single-precision (32-bit) floating-point elements) from `b` into result
1367/// at the location specified by `IMM1`.
1368///
1369/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_ps)
1370#[inline]
1371#[target_feature(enable = "avx")]
1372#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1373#[rustc_legacy_const_generics(2)]
1374#[stable(feature = "simd_x86", since = "1.27.0")]
1375pub fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
1376    static_assert_uimm_bits!(IMM1, 1);
1377    unsafe {
1378        simd_shuffle!(
1379            a,
1380            _mm256_castps128_ps256(b),
1381            [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
1382        )
1383    }
1384}
1385
1386/// Copies `a` to result, then inserts 128 bits (composed of 2 packed
1387/// double-precision (64-bit) floating-point elements) from `b` into result
1388/// at the location specified by `IMM1`.
1389///
1390/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_pd)
1391#[inline]
1392#[target_feature(enable = "avx")]
1393#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1394#[rustc_legacy_const_generics(2)]
1395#[stable(feature = "simd_x86", since = "1.27.0")]
1396pub fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
1397    static_assert_uimm_bits!(IMM1, 1);
1398    unsafe {
1399        simd_shuffle!(
1400            a,
1401            _mm256_castpd128_pd256(b),
1402            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
1403        )
1404    }
1405}
1406
1407/// Copies `a` to result, then inserts 128 bits from `b` into result
1408/// at the location specified by `IMM1`.
1409///
1410/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256)
1411#[inline]
1412#[target_feature(enable = "avx")]
1413#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1414#[rustc_legacy_const_generics(2)]
1415#[stable(feature = "simd_x86", since = "1.27.0")]
1416pub fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
1417    static_assert_uimm_bits!(IMM1, 1);
1418    unsafe {
1419        let dst: i64x4 = simd_shuffle!(
1420            a.as_i64x4(),
1421            _mm256_castsi128_si256(b).as_i64x4(),
1422            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
1423        );
1424        transmute(dst)
1425    }
1426}
1427
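// A minimal usage sketch for `_mm256_insertf128_si256` (illustrative only; the
// helper name below is ours). `IMM1` selects which 128-bit half of `a` is
// replaced by `b`: 0 for the low half, 1 for the high half.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn insertf128_si256_sketch() {
    let a = _mm256_set1_epi32(1);
    let b = _mm_set1_epi32(9);
    // With IMM1 = 1, the upper 128 bits of `a` are replaced by `b`.
    let r = _mm256_insertf128_si256::<1>(a, b);
    let mut out = [0i32; 8];
    unsafe { _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r) };
    assert_eq!(out, [1, 1, 1, 1, 9, 9, 9, 9]);
}
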
1428/// Copies `a` to result, and inserts the 8-bit integer `i` into result
1429/// at the location specified by `INDEX`.
1430///
1431/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8)
1432#[inline]
1433#[target_feature(enable = "avx")]
1434// This intrinsic has no corresponding instruction.
1435#[rustc_legacy_const_generics(2)]
1436#[stable(feature = "simd_x86", since = "1.27.0")]
1437pub fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
1438    static_assert_uimm_bits!(INDEX, 5);
1439    unsafe { transmute(simd_insert!(a.as_i8x32(), INDEX as u32, i)) }
1440}
1441
1442/// Copies `a` to result, and inserts the 16-bit integer `i` into result
1443/// at the location specified by `INDEX`.
1444///
1445/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16)
1446#[inline]
1447#[target_feature(enable = "avx")]
1448// This intrinsic has no corresponding instruction.
1449#[rustc_legacy_const_generics(2)]
1450#[stable(feature = "simd_x86", since = "1.27.0")]
1451pub fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
1452    static_assert_uimm_bits!(INDEX, 4);
1453    unsafe { transmute(simd_insert!(a.as_i16x16(), INDEX as u32, i)) }
1454}
1455
1456/// Copies `a` to result, and inserts the 32-bit integer `i` into result
1457/// at the location specified by `INDEX`.
1458///
1459/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi32)
1460#[inline]
1461#[target_feature(enable = "avx")]
1462// This intrinsic has no corresponding instruction.
1463#[rustc_legacy_const_generics(2)]
1464#[stable(feature = "simd_x86", since = "1.27.0")]
1465pub fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i {
1466    static_assert_uimm_bits!(INDEX, 3);
1467    unsafe { transmute(simd_insert!(a.as_i32x8(), INDEX as u32, i)) }
1468}
1469
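// A minimal usage sketch for `_mm256_insert_epi32` (illustrative only; the
// helper name below is ours). `INDEX` picks one of the eight 32-bit lanes,
// counted from the least significant element.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn insert_epi32_sketch() {
    let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    let r = _mm256_insert_epi32::<5>(a, -1);
    let mut out = [0i32; 8];
    unsafe { _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r) };
    assert_eq!(out, [0, 1, 2, 3, 4, -1, 6, 7]);
}
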
1470/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
1471/// floating-point elements) from memory into result.
1472/// `mem_addr` must be aligned on a 32-byte boundary or a
1473/// general-protection exception may be generated.
1474///
1475/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd)
1476#[inline]
1477#[target_feature(enable = "avx")]
1478#[cfg_attr(
1479    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1480    assert_instr(vmovap)
1481)]
1482#[stable(feature = "simd_x86", since = "1.27.0")]
1483#[allow(clippy::cast_ptr_alignment)]
1484pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d {
1485    *(mem_addr as *const __m256d)
1486}
1487
1488/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
1489/// floating-point elements) from `a` into memory.
1490/// `mem_addr` must be aligned on a 32-byte boundary or a
1491/// general-protection exception may be generated.
1492///
1493/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd)
1494#[inline]
1495#[target_feature(enable = "avx")]
1496#[cfg_attr(
1497    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1498    assert_instr(vmovap)
1499)]
1500#[stable(feature = "simd_x86", since = "1.27.0")]
1501#[allow(clippy::cast_ptr_alignment)]
1502pub unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) {
1503    *(mem_addr as *mut __m256d) = a;
1504}
1505
1506/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
1507/// floating-point elements) from memory into result.
1508/// `mem_addr` must be aligned on a 32-byte boundary or a
1509/// general-protection exception may be generated.
1510///
1511/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps)
1512#[inline]
1513#[target_feature(enable = "avx")]
1514#[cfg_attr(
1515    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1516    assert_instr(vmovaps)
1517)]
1518#[stable(feature = "simd_x86", since = "1.27.0")]
1519#[allow(clippy::cast_ptr_alignment)]
1520pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 {
1521    *(mem_addr as *const __m256)
1522}
1523
1524/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
1525/// floating-point elements) from `a` into memory.
1526/// `mem_addr` must be aligned on a 32-byte boundary or a
1527/// general-protection exception may be generated.
1528///
1529/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps)
1530#[inline]
1531#[target_feature(enable = "avx")]
1532#[cfg_attr(
1533    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1534    assert_instr(vmovaps)
1535)]
1536#[stable(feature = "simd_x86", since = "1.27.0")]
1537#[allow(clippy::cast_ptr_alignment)]
1538pub unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) {
1539    *(mem_addr as *mut __m256) = a;
1540}
1541
1542/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
1543/// floating-point elements) from memory into result.
1544/// `mem_addr` does not need to be aligned on any particular boundary.
1545///
1546/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_pd)
1547#[inline]
1548#[target_feature(enable = "avx")]
1549#[cfg_attr(test, assert_instr(vmovup))]
1550#[stable(feature = "simd_x86", since = "1.27.0")]
1551pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d {
1552    let mut dst = _mm256_undefined_pd();
1553    ptr::copy_nonoverlapping(
1554        mem_addr as *const u8,
1555        ptr::addr_of_mut!(dst) as *mut u8,
1556        mem::size_of::<__m256d>(),
1557    );
1558    dst
1559}
1560
1561/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
1562/// floating-point elements) from `a` into memory.
1563/// `mem_addr` does not need to be aligned on any particular boundary.
1564///
1565/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_pd)
1566#[inline]
1567#[target_feature(enable = "avx")]
1568#[cfg_attr(test, assert_instr(vmovup))]
1569#[stable(feature = "simd_x86", since = "1.27.0")]
1570pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) {
1571    mem_addr.cast::<__m256d>().write_unaligned(a);
1572}
1573
1574/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
1575/// floating-point elements) from memory into result.
1576/// `mem_addr` does not need to be aligned on any particular boundary.
1577///
1578/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_ps)
1579#[inline]
1580#[target_feature(enable = "avx")]
1581#[cfg_attr(test, assert_instr(vmovups))]
1582#[stable(feature = "simd_x86", since = "1.27.0")]
1583pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 {
1584    let mut dst = _mm256_undefined_ps();
1585    ptr::copy_nonoverlapping(
1586        mem_addr as *const u8,
1587        ptr::addr_of_mut!(dst) as *mut u8,
1588        mem::size_of::<__m256>(),
1589    );
1590    dst
1591}
1592
1593/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
1594/// floating-point elements) from `a` into memory.
1595/// `mem_addr` does not need to be aligned on any particular boundary.
1596///
1597/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_ps)
1598#[inline]
1599#[target_feature(enable = "avx")]
1600#[cfg_attr(test, assert_instr(vmovups))]
1601#[stable(feature = "simd_x86", since = "1.27.0")]
1602pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) {
1603    mem_addr.cast::<__m256>().write_unaligned(a);
1604}
1605
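// A minimal sketch contrasting `_mm256_loadu_ps` with `_mm256_load_ps`
// (illustrative only; the helper name below is ours): `loadu` accepts any
// address, while `load` is only sound for 32-byte aligned pointers.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn loadu_ps_sketch() {
    let data: [f32; 9] = [0., 1., 2., 3., 4., 5., 6., 7., 8.];
    // Starting at index 1 is generally not 32-byte aligned, so the unaligned
    // load must be used here.
    let v = unsafe { _mm256_loadu_ps(data.as_ptr().add(1)) };
    let mut out = [0.0f32; 8];
    unsafe { _mm256_storeu_ps(out.as_mut_ptr(), v) };
    assert_eq!(out, [1., 2., 3., 4., 5., 6., 7., 8.]);
}
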
1606/// Loads 256-bits of integer data from memory into result.
1607/// `mem_addr` must be aligned on a 32-byte boundary or a
1608/// general-protection exception may be generated.
1609///
1610/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256)
1611#[inline]
1612#[target_feature(enable = "avx")]
1613#[cfg_attr(
1614    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1615    assert_instr(vmovaps)
1616)] // FIXME vmovdqa expected
1617#[stable(feature = "simd_x86", since = "1.27.0")]
1618pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i {
1619    *mem_addr
1620}
1621
1622/// Stores 256-bits of integer data from `a` into memory.
1623/// `mem_addr` must be aligned on a 32-byte boundary or a
1624/// general-protection exception may be generated.
1625///
1626/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256)
1627#[inline]
1628#[target_feature(enable = "avx")]
1629#[cfg_attr(
1630    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1631    assert_instr(vmovaps)
1632)] // FIXME vmovdqa expected
1633#[stable(feature = "simd_x86", since = "1.27.0")]
1634pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) {
1635    *mem_addr = a;
1636}
1637
1638/// Loads 256-bits of integer data from memory into result.
1639/// `mem_addr` does not need to be aligned on any particular boundary.
1640///
1641/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_si256)
1642#[inline]
1643#[target_feature(enable = "avx")]
1644#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
1645#[stable(feature = "simd_x86", since = "1.27.0")]
1646pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i {
1647    let mut dst = _mm256_undefined_si256();
1648    ptr::copy_nonoverlapping(
1649        mem_addr as *const u8,
1650        ptr::addr_of_mut!(dst) as *mut u8,
1651        mem::size_of::<__m256i>(),
1652    );
1653    dst
1654}
1655
1656/// Stores 256-bits of integer data from `a` into memory.
1657/// `mem_addr` does not need to be aligned on any particular boundary.
1658///
1659/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_si256)
1660#[inline]
1661#[target_feature(enable = "avx")]
1662#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
1663#[stable(feature = "simd_x86", since = "1.27.0")]
1664pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
1665    mem_addr.write_unaligned(a);
1666}
1667
1668/// Loads packed double-precision (64-bit) floating-point elements from memory
1669/// into result using `mask` (elements are zeroed out when the high bit of the
1670/// corresponding element is not set).
1671///
1672/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_pd)
1673#[inline]
1674#[target_feature(enable = "avx")]
1675#[cfg_attr(test, assert_instr(vmaskmovpd))]
1676#[stable(feature = "simd_x86", since = "1.27.0")]
1677pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d {
1678    maskloadpd256(mem_addr as *const i8, mask.as_i64x4())
1679}
1680
1681/// Stores packed double-precision (64-bit) floating-point elements from `a`
1682/// into memory using `mask`.
1683///
1684/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_pd)
1685#[inline]
1686#[target_feature(enable = "avx")]
1687#[cfg_attr(test, assert_instr(vmaskmovpd))]
1688#[stable(feature = "simd_x86", since = "1.27.0")]
1689pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) {
1690    maskstorepd256(mem_addr as *mut i8, mask.as_i64x4(), a);
1691}
1692
1693/// Loads packed double-precision (64-bit) floating-point elements from memory
1694/// into result using `mask` (elements are zeroed out when the high bit of the
1695/// corresponding element is not set).
1696///
1697/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_pd)
1698#[inline]
1699#[target_feature(enable = "avx")]
1700#[cfg_attr(test, assert_instr(vmaskmovpd))]
1701#[stable(feature = "simd_x86", since = "1.27.0")]
1702pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d {
1703    maskloadpd(mem_addr as *const i8, mask.as_i64x2())
1704}
1705
1706/// Stores packed double-precision (64-bit) floating-point elements from `a`
1707/// into memory using `mask`.
1708///
1709/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_pd)
1710#[inline]
1711#[target_feature(enable = "avx")]
1712#[cfg_attr(test, assert_instr(vmaskmovpd))]
1713#[stable(feature = "simd_x86", since = "1.27.0")]
1714pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
1715    maskstorepd(mem_addr as *mut i8, mask.as_i64x2(), a);
1716}
1717
1718/// Loads packed single-precision (32-bit) floating-point elements from memory
1719/// into result using `mask` (elements are zeroed out when the high bit of the
1720/// corresponding element is not set).
1721///
1722/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_ps)
1723#[inline]
1724#[target_feature(enable = "avx")]
1725#[cfg_attr(test, assert_instr(vmaskmovps))]
1726#[stable(feature = "simd_x86", since = "1.27.0")]
1727pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 {
1728    maskloadps256(mem_addr as *const i8, mask.as_i32x8())
1729}
1730
1731/// Stores packed single-precision (32-bit) floating-point elements from `a`
1732/// into memory using `mask`.
1733///
1734/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_ps)
1735#[inline]
1736#[target_feature(enable = "avx")]
1737#[cfg_attr(test, assert_instr(vmaskmovps))]
1738#[stable(feature = "simd_x86", since = "1.27.0")]
1739pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) {
1740    maskstoreps256(mem_addr as *mut i8, mask.as_i32x8(), a);
1741}
1742
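// A minimal usage sketch for `_mm256_maskload_ps` (illustrative only; the
// helper name below is ours): only elements whose mask lane has its high bit
// set are read from memory; the remaining lanes are zeroed.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn maskload_ps_sketch() {
    let data: [f32; 8] = [1., 2., 3., 4., 5., 6., 7., 8.];
    // -1 has the high bit set (load this element); 0 does not (zero it).
    let mask = _mm256_setr_epi32(-1, 0, -1, 0, 0, 0, 0, -1);
    let v = unsafe { _mm256_maskload_ps(data.as_ptr(), mask) };
    let mut out = [0.0f32; 8];
    unsafe { _mm256_storeu_ps(out.as_mut_ptr(), v) };
    assert_eq!(out, [1., 0., 3., 0., 0., 0., 0., 8.]);
}
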
1743/// Loads packed single-precision (32-bit) floating-point elements from memory
1744/// into result using `mask` (elements are zeroed out when the high bit of the
1745/// corresponding element is not set).
1746///
1747/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_ps)
1748#[inline]
1749#[target_feature(enable = "avx")]
1750#[cfg_attr(test, assert_instr(vmaskmovps))]
1751#[stable(feature = "simd_x86", since = "1.27.0")]
1752pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 {
1753    maskloadps(mem_addr as *const i8, mask.as_i32x4())
1754}
1755
1756/// Stores packed single-precision (32-bit) floating-point elements from `a`
1757/// into memory using `mask`.
1758///
1759/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_ps)
1760#[inline]
1761#[target_feature(enable = "avx")]
1762#[cfg_attr(test, assert_instr(vmaskmovps))]
1763#[stable(feature = "simd_x86", since = "1.27.0")]
1764pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
1765    maskstoreps(mem_addr as *mut i8, mask.as_i32x4(), a);
1766}
1767
1768/// Duplicates odd-indexed single-precision (32-bit) floating-point elements
1769/// from `a`, and returns the results.
1770///
1771/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movehdup_ps)
1772#[inline]
1773#[target_feature(enable = "avx")]
1774#[cfg_attr(test, assert_instr(vmovshdup))]
1775#[stable(feature = "simd_x86", since = "1.27.0")]
1776pub fn _mm256_movehdup_ps(a: __m256) -> __m256 {
1777    unsafe { simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7]) }
1778}
1779
1780/// Duplicates even-indexed single-precision (32-bit) floating-point elements
1781/// from `a`, and returns the results.
1782///
1783/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_moveldup_ps)
1784#[inline]
1785#[target_feature(enable = "avx")]
1786#[cfg_attr(test, assert_instr(vmovsldup))]
1787#[stable(feature = "simd_x86", since = "1.27.0")]
1788pub fn _mm256_moveldup_ps(a: __m256) -> __m256 {
1789    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]) }
1790}
1791
1792/// Duplicates even-indexed double-precision (64-bit) floating-point elements
1793/// from `a`, and returns the results.
1794///
1795/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movedup_pd)
1796#[inline]
1797#[target_feature(enable = "avx")]
1798#[cfg_attr(test, assert_instr(vmovddup))]
1799#[stable(feature = "simd_x86", since = "1.27.0")]
1800pub fn _mm256_movedup_pd(a: __m256d) -> __m256d {
1801    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2]) }
1802}
1803
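// A minimal usage sketch for `_mm256_movedup_pd` (illustrative only; the
// helper name below is ours): each even-indexed element is copied into the
// adjacent odd position.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn movedup_pd_sketch() {
    let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    let r = _mm256_movedup_pd(a);
    let mut out = [0.0f64; 4];
    unsafe { _mm256_storeu_pd(out.as_mut_ptr(), r) };
    assert_eq!(out, [1.0, 1.0, 3.0, 3.0]);
}
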
1804/// Loads 256-bits of integer data from unaligned memory into result.
1805/// This intrinsic may perform better than `_mm256_loadu_si256` when the
1806/// data crosses a cache line boundary.
1807///
1808/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256)
1809#[inline]
1810#[target_feature(enable = "avx")]
1811#[cfg_attr(test, assert_instr(vlddqu))]
1812#[stable(feature = "simd_x86", since = "1.27.0")]
1813pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
1814    transmute(vlddqu(mem_addr as *const i8))
1815}
1816
1817/// Moves integer data from a 256-bit integer vector to a 32-byte
1818/// aligned memory location. To minimize caching, the data is flagged as
1819/// non-temporal (unlikely to be used again soon).
1820///
1821/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256)
1822///
1823/// # Safety of non-temporal stores
1824///
1825/// After using this intrinsic, but before any other access to the memory that this intrinsic
1826/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1827/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1828/// return.
1829///
1830/// See [`_mm_sfence`] for details.
1831#[inline]
1832#[target_feature(enable = "avx")]
1833#[cfg_attr(test, assert_instr(vmovntdq))]
1834#[stable(feature = "simd_x86", since = "1.27.0")]
1835pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
1836    // See #1541: we use inline asm to be sure, because the LLVM LangRef isn't clear enough.
1837    crate::arch::asm!(
1838        vps!("vmovntdq", ",{a}"),
1839        p = in(reg) mem_addr,
1840        a = in(ymm_reg) a,
1841        options(nostack, preserves_flags),
1842    );
1843}
1844
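// A minimal sketch of the non-temporal store pattern (illustrative only; the
// helper name and buffer argument are ours): every use of
// `_mm256_stream_si256` is followed by `_mm_sfence` before the memory is
// touched again or the function returns.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn stream_si256_sketch(dst: &mut [__m256i]) {
    let v = _mm256_set1_epi32(42);
    for slot in dst.iter_mut() {
        // Each `__m256i` slot is 32-byte aligned, as the instruction requires.
        unsafe { _mm256_stream_si256(slot, v) };
    }
    // Publish the non-temporal stores before anyone (including this thread)
    // reads the buffer again.
    _mm_sfence();
}
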
1845/// Moves double-precision values from a 256-bit vector of `[4 x double]`
1846/// to a 32-byte aligned memory location. To minimize caching, the data is
1847/// flagged as non-temporal (unlikely to be used again soon).
1848///
1849/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd)
1850///
1851/// # Safety of non-temporal stores
1852///
1853/// After using this intrinsic, but before any other access to the memory that this intrinsic
1854/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1855/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1856/// return.
1857///
1858/// See [`_mm_sfence`] for details.
1859#[inline]
1860#[target_feature(enable = "avx")]
1861#[cfg_attr(test, assert_instr(vmovntpd))]
1862#[stable(feature = "simd_x86", since = "1.27.0")]
1863#[allow(clippy::cast_ptr_alignment)]
1864pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
1865    // See #1541: we use inline asm to be sure, because the LLVM LangRef isn't clear enough.
1866    crate::arch::asm!(
1867        vps!("vmovntpd", ",{a}"),
1868        p = in(reg) mem_addr,
1869        a = in(ymm_reg) a,
1870        options(nostack, preserves_flags),
1871    );
1872}
1873
1874/// Moves single-precision floating point values from a 256-bit vector
1875/// of `[8 x float]` to a 32-byte aligned memory location. To minimize
1876/// caching, the data is flagged as non-temporal (unlikely to be used again
1877/// soon).
1878///
1879/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps)
1880///
1881/// # Safety of non-temporal stores
1882///
1883/// After using this intrinsic, but before any other access to the memory that this intrinsic
1884/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1885/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1886/// return.
1887///
1888/// See [`_mm_sfence`] for details.
1889#[inline]
1890#[target_feature(enable = "avx")]
1891#[cfg_attr(test, assert_instr(vmovntps))]
1892#[stable(feature = "simd_x86", since = "1.27.0")]
1893#[allow(clippy::cast_ptr_alignment)]
1894pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
1895    // See #1541: we use inline asm to be sure, because the LLVM LangRef isn't clear enough.
1896    crate::arch::asm!(
1897        vps!("vmovntps", ",{a}"),
1898        p = in(reg) mem_addr,
1899        a = in(ymm_reg) a,
1900        options(nostack, preserves_flags),
1901    );
1902}
1903
1904/// Computes the approximate reciprocal of packed single-precision (32-bit)
1905/// floating-point elements in `a`, and returns the results. The maximum
1906/// relative error for this approximation is less than 1.5*2^-12.
1907///
1908/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps)
1909#[inline]
1910#[target_feature(enable = "avx")]
1911#[cfg_attr(test, assert_instr(vrcpps))]
1912#[stable(feature = "simd_x86", since = "1.27.0")]
1913pub fn _mm256_rcp_ps(a: __m256) -> __m256 {
1914    unsafe { vrcpps(a) }
1915}
1916
1917/// Computes the approximate reciprocal square root of packed single-precision
1918/// (32-bit) floating-point elements in `a`, and returns the results.
1919/// The maximum relative error for this approximation is less than 1.5*2^-12.
1920///
1921/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps)
1922#[inline]
1923#[target_feature(enable = "avx")]
1924#[cfg_attr(test, assert_instr(vrsqrtps))]
1925#[stable(feature = "simd_x86", since = "1.27.0")]
1926pub fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
1927    unsafe { vrsqrtps(a) }
1928}
1929
1930/// Unpacks and interleaves double-precision (64-bit) floating-point elements
1931/// from the high half of each 128-bit lane in `a` and `b`.
1932///
1933/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_pd)
1934#[inline]
1935#[target_feature(enable = "avx")]
1936#[cfg_attr(test, assert_instr(vunpckhpd))]
1937#[stable(feature = "simd_x86", since = "1.27.0")]
1938pub fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
1939    unsafe { simd_shuffle!(a, b, [1, 5, 3, 7]) }
1940}
1941
1942/// Unpacks and interleaves single-precision (32-bit) floating-point elements
1943/// from the high half of each 128-bit lane in `a` and `b`.
1944///
1945/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_ps)
1946#[inline]
1947#[target_feature(enable = "avx")]
1948#[cfg_attr(test, assert_instr(vunpckhps))]
1949#[stable(feature = "simd_x86", since = "1.27.0")]
1950pub fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
1951    unsafe { simd_shuffle!(a, b, [2, 10, 3, 11, 6, 14, 7, 15]) }
1952}
1953
1954/// Unpacks and interleaves double-precision (64-bit) floating-point elements
1955/// from the low half of each 128-bit lane in `a` and `b`.
1956///
1957/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_pd)
1958#[inline]
1959#[target_feature(enable = "avx")]
1960#[cfg_attr(test, assert_instr(vunpcklpd))]
1961#[stable(feature = "simd_x86", since = "1.27.0")]
1962pub fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
1963    unsafe { simd_shuffle!(a, b, [0, 4, 2, 6]) }
1964}
1965
1966/// Unpacks and interleaves single-precision (32-bit) floating-point elements
1967/// from the low half of each 128-bit lane in `a` and `b`.
1968///
1969/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_ps)
1970#[inline]
1971#[target_feature(enable = "avx")]
1972#[cfg_attr(test, assert_instr(vunpcklps))]
1973#[stable(feature = "simd_x86", since = "1.27.0")]
1974pub fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
1975    unsafe { simd_shuffle!(a, b, [0, 8, 1, 9, 4, 12, 5, 13]) }
1976}
1977
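// A minimal usage sketch for `_mm256_unpacklo_ps` (illustrative only; the
// helper name below is ours): the interleaving happens within each 128-bit
// lane independently, not across the full 256-bit vector.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn unpacklo_ps_sketch() {
    let a = _mm256_setr_ps(0., 1., 2., 3., 4., 5., 6., 7.);
    let b = _mm256_setr_ps(10., 11., 12., 13., 14., 15., 16., 17.);
    let r = _mm256_unpacklo_ps(a, b);
    let mut out = [0.0f32; 8];
    unsafe { _mm256_storeu_ps(out.as_mut_ptr(), r) };
    // The low half of each 128-bit lane is interleaved: elements 0, 1 in the
    // low lane and elements 4, 5 in the high lane.
    assert_eq!(out, [0., 10., 1., 11., 4., 14., 5., 15.]);
}
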
1978/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
1979/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
1980/// Computes the bitwise NOT of `a` and then ANDs it with `b`, and sets `CF` to 1 if
1981/// the result is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
1982///
1983/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256)
1984#[inline]
1985#[target_feature(enable = "avx")]
1986#[cfg_attr(test, assert_instr(vptest))]
1987#[stable(feature = "simd_x86", since = "1.27.0")]
1988pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
1989    unsafe {
1990        let r = simd_and(a.as_i64x4(), b.as_i64x4());
1991        (0i64 == simd_reduce_or(r)) as i32
1992    }
1993}
1994
1995/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
1996/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
1997/// Computes the bitwise NOT of `a` and then ANDs it with `b`, and sets `CF` to 1 if
1998/// the result is zero, otherwise sets `CF` to 0. Returns the `CF` value.
1999///
2000/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_si256)
2001#[inline]
2002#[target_feature(enable = "avx")]
2003#[cfg_attr(test, assert_instr(vptest))]
2004#[stable(feature = "simd_x86", since = "1.27.0")]
2005pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
2006    unsafe {
2007        let r = simd_and(simd_xor(a.as_i64x4(), i64x4::splat(!0)), b.as_i64x4());
2008        (0i64 == simd_reduce_or(r)) as i32
2009    }
2010}
2011
2012/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
2013/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
2014/// Computes the bitwise NOT of `a` and then ANDs it with `b`, and sets `CF` to 1 if
2015/// the result is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and
2016/// `CF` values are zero, otherwise returns 0.
2017///
2018/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256)
2019#[inline]
2020#[target_feature(enable = "avx")]
2021#[cfg_attr(test, assert_instr(vptest))]
2022#[stable(feature = "simd_x86", since = "1.27.0")]
2023pub fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 {
2024    unsafe { ptestnzc256(a.as_i64x4(), b.as_i64x4()) }
2025}
2026
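// A minimal usage sketch for `_mm256_testz_si256` (illustrative only; the
// helper name below is ours): `testz(a, a)` is the usual "is the vector all
// zero?" idiom, while `testz(a, b)` reports whether `a` and `b` share no set
// bits.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn testz_si256_sketch() {
    let a = _mm256_setr_epi64x(0b0011, 0, 0, 0);
    let b = _mm256_setr_epi64x(0b1100, 0, 0, 0);
    assert_eq!(_mm256_testz_si256(a, b), 1); // no bits in common, so ZF = 1
    assert_eq!(_mm256_testz_si256(a, a), 0); // `a` is non-zero, so ZF = 0
    let zero = _mm256_setzero_si256();
    assert_eq!(_mm256_testz_si256(zero, zero), 1);
}
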
2027/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
2028/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2029/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
2030/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2031/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2032/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2033/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
2034///
2035/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd)
2036#[inline]
2037#[target_feature(enable = "avx")]
2038#[cfg_attr(test, assert_instr(vtestpd))]
2039#[stable(feature = "simd_x86", since = "1.27.0")]
2040pub fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 {
2041    unsafe { vtestzpd256(a, b) }
2042}
2043
2044/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
2045/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2046/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
2047/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2048/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2049/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2050/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
2051///
2052/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd)
2053#[inline]
2054#[target_feature(enable = "avx")]
2055#[cfg_attr(test, assert_instr(vtestpd))]
2056#[stable(feature = "simd_x86", since = "1.27.0")]
2057pub fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 {
2058    unsafe { vtestcpd256(a, b) }
2059}
2060
2061/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
2062/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2063/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
2064/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2065/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2066/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2067/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
2068/// are zero, otherwise returns 0.
2069///
2070/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd)
2071#[inline]
2072#[target_feature(enable = "avx")]
2073#[cfg_attr(test, assert_instr(vtestpd))]
2074#[stable(feature = "simd_x86", since = "1.27.0")]
2075pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
2076    unsafe { vtestnzcpd256(a, b) }
2077}
2078
2079/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2080/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2081/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
2082/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2083/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2084/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2085/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
2086///
2087/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_pd)
2088#[inline]
2089#[target_feature(enable = "avx")]
2090#[cfg_attr(test, assert_instr(vtestpd))]
2091#[stable(feature = "simd_x86", since = "1.27.0")]
2092pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
2093    unsafe {
2094        let r: i64x2 = simd_lt(transmute(_mm_and_pd(a, b)), i64x2::ZERO);
2095        (0i64 == simd_reduce_or(r)) as i32
2096    }
2097}
2098
2099/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2100/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2101/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
2102/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2103/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2104/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2105/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
2106///
2107/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_pd)
2108#[inline]
2109#[target_feature(enable = "avx")]
2110#[cfg_attr(test, assert_instr(vtestpd))]
2111#[stable(feature = "simd_x86", since = "1.27.0")]
2112pub fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
2113    unsafe {
2114        let r: i64x2 = simd_lt(transmute(_mm_andnot_pd(a, b)), i64x2::ZERO);
2115        (0i64 == simd_reduce_or(r)) as i32
2116    }
2117}
2118
2119/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2120/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2121/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
2122/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2123/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2124/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2125/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
2126/// are zero, otherwise returns 0.
2127///
2128/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd)
2129#[inline]
2130#[target_feature(enable = "avx")]
2131#[cfg_attr(test, assert_instr(vtestpd))]
2132#[stable(feature = "simd_x86", since = "1.27.0")]
2133pub fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 {
2134    unsafe { vtestnzcpd(a, b) }
2135}
2136
2137/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2138/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2139/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2140/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2141/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2142/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2143/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
2144///
2145/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps)
2146#[inline]
2147#[target_feature(enable = "avx")]
2148#[cfg_attr(test, assert_instr(vtestps))]
2149#[stable(feature = "simd_x86", since = "1.27.0")]
2150pub fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 {
2151    unsafe { vtestzps256(a, b) }
2152}
2153
2154/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2155/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2156/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2157/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2158/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2159/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2160/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
2161///
2162/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps)
2163#[inline]
2164#[target_feature(enable = "avx")]
2165#[cfg_attr(test, assert_instr(vtestps))]
2166#[stable(feature = "simd_x86", since = "1.27.0")]
2167pub fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 {
2168    unsafe { vtestcps256(a, b) }
2169}
2170
2171/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2172/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2173/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2174/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2175/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2176/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2177/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
2178/// are zero, otherwise returns 0.
2179///
2180/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps)
2181#[inline]
2182#[target_feature(enable = "avx")]
2183#[cfg_attr(test, assert_instr(vtestps))]
2184#[stable(feature = "simd_x86", since = "1.27.0")]
2185pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
2186    unsafe { vtestnzcps256(a, b) }
2187}
2188
2189/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2190/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2191/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2192/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2193/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2194/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2195/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
2196///
2197/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_ps)
2198#[inline]
2199#[target_feature(enable = "avx")]
2200#[cfg_attr(test, assert_instr(vtestps))]
2201#[stable(feature = "simd_x86", since = "1.27.0")]
2202pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
2203    unsafe {
2204        let r: i32x4 = simd_lt(transmute(_mm_and_ps(a, b)), i32x4::ZERO);
2205        (0i32 == simd_reduce_or(r)) as i32
2206    }
2207}
2208
2209/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2210/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2211/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2212/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2213/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2214/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2215/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
2216///
2217/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_ps)
2218#[inline]
2219#[target_feature(enable = "avx")]
2220#[cfg_attr(test, assert_instr(vtestps))]
2221#[stable(feature = "simd_x86", since = "1.27.0")]
2222pub fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
2223    unsafe {
2224        let r: i32x4 = simd_lt(transmute(_mm_andnot_ps(a, b)), i32x4::ZERO);
2225        (0i32 == simd_reduce_or(r)) as i32
2226    }
2227}
2228
2229/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2230/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2231/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2232/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2233/// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets
2234/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2235/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
2236/// are zero, otherwise returns 0.
2237///
2238/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps)
2239#[inline]
2240#[target_feature(enable = "avx")]
2241#[cfg_attr(test, assert_instr(vtestps))]
2242#[stable(feature = "simd_x86", since = "1.27.0")]
2243pub fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
2244    unsafe { vtestnzcps(a, b) }
2245}
2246
2247/// Sets each bit of the returned mask based on the most significant bit of the
2248/// corresponding packed double-precision (64-bit) floating-point element in
2249/// `a`.
2250///
2251/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_pd)
2252#[inline]
2253#[target_feature(enable = "avx")]
2254#[cfg_attr(test, assert_instr(vmovmskpd))]
2255#[stable(feature = "simd_x86", since = "1.27.0")]
2256pub fn _mm256_movemask_pd(a: __m256d) -> i32 {
2257    // Propagate the highest bit to the rest, because simd_bitmask
2258    // requires all-1 or all-0.
2259    unsafe {
2260        let mask: i64x4 = simd_lt(transmute(a), i64x4::ZERO);
2261        simd_bitmask::<i64x4, u8>(mask).into()
2262    }
2263}
2264
2265/// Sets each bit of the returned mask based on the most significant bit of the
2266/// corresponding packed single-precision (32-bit) floating-point element in
2267/// `a`.
2268///
2269/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps)
2270#[inline]
2271#[target_feature(enable = "avx")]
2272#[cfg_attr(test, assert_instr(vmovmskps))]
2273#[stable(feature = "simd_x86", since = "1.27.0")]
2274pub fn _mm256_movemask_ps(a: __m256) -> i32 {
2275    // Propagate the highest bit to the rest, because simd_bitmask
2276    // requires all-1 or all-0.
2277    unsafe {
2278        let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO);
2279        simd_bitmask::<i32x8, u8>(mask).into()
2280    }
2281}
2282
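// A minimal usage sketch for `_mm256_movemask_pd` (illustrative only; the
// helper name below is ours): bit `i` of the result is the sign bit of
// element `i`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn movemask_pd_sketch() {
    let a = _mm256_setr_pd(-1.0, 2.0, -3.0, 4.0);
    // Elements 0 and 2 are negative, so bits 0 and 2 of the mask are set.
    assert_eq!(_mm256_movemask_pd(a), 0b0101);
}
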
2283/// Returns vector of type __m256d with all elements set to zero.
2284///
2285/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_pd)
2286#[inline]
2287#[target_feature(enable = "avx")]
2288#[cfg_attr(test, assert_instr(vxorp))]
2289#[stable(feature = "simd_x86", since = "1.27.0")]
2290pub fn _mm256_setzero_pd() -> __m256d {
2291    const { unsafe { mem::zeroed() } }
2292}
2293
2294/// Returns vector of type __m256 with all elements set to zero.
2295///
2296/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps)
2297#[inline]
2298#[target_feature(enable = "avx")]
2299#[cfg_attr(test, assert_instr(vxorps))]
2300#[stable(feature = "simd_x86", since = "1.27.0")]
2301pub fn _mm256_setzero_ps() -> __m256 {
2302    const { unsafe { mem::zeroed() } }
2303}
2304
2305/// Returns vector of type __m256i with all elements set to zero.
2306///
2307/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256)
2308#[inline]
2309#[target_feature(enable = "avx")]
2310#[cfg_attr(test, assert_instr(vxor))]
2311#[stable(feature = "simd_x86", since = "1.27.0")]
2312pub fn _mm256_setzero_si256() -> __m256i {
2313    const { unsafe { mem::zeroed() } }
2314}
2315
2316/// Sets packed double-precision (64-bit) floating-point elements in returned
2317/// vector with the supplied values.
2318///
2319/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_pd)
2320#[inline]
2321#[target_feature(enable = "avx")]
2322// This intrinsic has no corresponding instruction.
2323#[cfg_attr(test, assert_instr(vinsertf128))]
2324#[stable(feature = "simd_x86", since = "1.27.0")]
2325pub fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2326    _mm256_setr_pd(d, c, b, a)
2327}
2328
2329/// Sets packed single-precision (32-bit) floating-point elements in returned
2330/// vector with the supplied values.
2331///
2332/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_ps)
2333#[inline]
2334#[target_feature(enable = "avx")]
2335// This intrinsic has no corresponding instruction.
2336#[stable(feature = "simd_x86", since = "1.27.0")]
2337pub fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 {
2338    _mm256_setr_ps(h, g, f, e, d, c, b, a)
2339}
2340
2341/// Sets packed 8-bit integers in returned vector with the supplied values.
2342///
2343/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8)
2344#[inline]
2345#[target_feature(enable = "avx")]
2346// This intrinsic has no corresponding instruction.
2347#[stable(feature = "simd_x86", since = "1.27.0")]
2348pub fn _mm256_set_epi8(
2349    e00: i8,
2350    e01: i8,
2351    e02: i8,
2352    e03: i8,
2353    e04: i8,
2354    e05: i8,
2355    e06: i8,
2356    e07: i8,
2357    e08: i8,
2358    e09: i8,
2359    e10: i8,
2360    e11: i8,
2361    e12: i8,
2362    e13: i8,
2363    e14: i8,
2364    e15: i8,
2365    e16: i8,
2366    e17: i8,
2367    e18: i8,
2368    e19: i8,
2369    e20: i8,
2370    e21: i8,
2371    e22: i8,
2372    e23: i8,
2373    e24: i8,
2374    e25: i8,
2375    e26: i8,
2376    e27: i8,
2377    e28: i8,
2378    e29: i8,
2379    e30: i8,
2380    e31: i8,
2381) -> __m256i {
2382    #[rustfmt::skip]
2383    _mm256_setr_epi8(
2384        e31, e30, e29, e28, e27, e26, e25, e24,
2385        e23, e22, e21, e20, e19, e18, e17, e16,
2386        e15, e14, e13, e12, e11, e10, e09, e08,
2387        e07, e06, e05, e04, e03, e02, e01, e00,
2388    )
2389}
2390
2391/// Sets packed 16-bit integers in returned vector with the supplied values.
2392///
2393/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16)
2394#[inline]
2395#[target_feature(enable = "avx")]
2396// This intrinsic has no corresponding instruction.
2397#[stable(feature = "simd_x86", since = "1.27.0")]
2398pub fn _mm256_set_epi16(
2399    e00: i16,
2400    e01: i16,
2401    e02: i16,
2402    e03: i16,
2403    e04: i16,
2404    e05: i16,
2405    e06: i16,
2406    e07: i16,
2407    e08: i16,
2408    e09: i16,
2409    e10: i16,
2410    e11: i16,
2411    e12: i16,
2412    e13: i16,
2413    e14: i16,
2414    e15: i16,
2415) -> __m256i {
2416    #[rustfmt::skip]
2417    _mm256_setr_epi16(
2418        e15, e14, e13, e12,
2419        e11, e10, e09, e08,
2420        e07, e06, e05, e04,
2421        e03, e02, e01, e00,
2422    )
2423}
2424
2425/// Sets packed 32-bit integers in returned vector with the supplied values.
2426///
2427/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32)
2428#[inline]
2429#[target_feature(enable = "avx")]
2430// This intrinsic has no corresponding instruction.
2431#[stable(feature = "simd_x86", since = "1.27.0")]
2432pub fn _mm256_set_epi32(
2433    e0: i32,
2434    e1: i32,
2435    e2: i32,
2436    e3: i32,
2437    e4: i32,
2438    e5: i32,
2439    e6: i32,
2440    e7: i32,
2441) -> __m256i {
2442    _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
2443}
2444
2445/// Sets packed 64-bit integers in returned vector with the supplied values.
2446///
2447/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x)
2448#[inline]
2449#[target_feature(enable = "avx")]
2450// This intrinsic has no corresponding instruction.
2451#[stable(feature = "simd_x86", since = "1.27.0")]
2452pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2453    _mm256_setr_epi64x(d, c, b, a)
2454}
2455
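// A minimal sketch of the `set` vs `setr` argument order (illustrative only;
// the helper name below is ours): `set` takes its arguments from the most
// significant element down, while `setr` ("reversed") takes them in memory
// order, so the two calls below build the same vector.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn set_epi64x_sketch() {
    let hi_to_lo = _mm256_set_epi64x(3, 2, 1, 0);
    let lo_to_hi = _mm256_setr_epi64x(0, 1, 2, 3);
    let mut a = [0i64; 4];
    let mut b = [0i64; 4];
    unsafe {
        _mm256_storeu_si256(a.as_mut_ptr() as *mut __m256i, hi_to_lo);
        _mm256_storeu_si256(b.as_mut_ptr() as *mut __m256i, lo_to_hi);
    }
    assert_eq!(a, b);
    assert_eq!(a, [0, 1, 2, 3]);
}
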
2456/// Sets packed double-precision (64-bit) floating-point elements in returned
2457/// vector with the supplied values in reverse order.
2458///
2459/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_pd)
2460#[inline]
2461#[target_feature(enable = "avx")]
2462// This intrinsic has no corresponding instruction.
2463#[stable(feature = "simd_x86", since = "1.27.0")]
2464pub fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2465    __m256d([a, b, c, d])
2466}
2467
2468/// Sets packed single-precision (32-bit) floating-point elements in returned
2469/// vector with the supplied values in reverse order.
2470///
2471/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_ps)
2472#[inline]
2473#[target_feature(enable = "avx")]
2474// This intrinsic has no corresponding instruction.
2475#[stable(feature = "simd_x86", since = "1.27.0")]
2476pub fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 {
2477    __m256([a, b, c, d, e, f, g, h])
2478}
2479
2480/// Sets packed 8-bit integers in returned vector with the supplied values in
2481/// reverse order.
2482///
2483/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi8)
2484#[inline]
2485#[target_feature(enable = "avx")]
2486// This intrinsic has no corresponding instruction.
2487#[stable(feature = "simd_x86", since = "1.27.0")]
2488pub fn _mm256_setr_epi8(
2489    e00: i8,
2490    e01: i8,
2491    e02: i8,
2492    e03: i8,
2493    e04: i8,
2494    e05: i8,
2495    e06: i8,
2496    e07: i8,
2497    e08: i8,
2498    e09: i8,
2499    e10: i8,
2500    e11: i8,
2501    e12: i8,
2502    e13: i8,
2503    e14: i8,
2504    e15: i8,
2505    e16: i8,
2506    e17: i8,
2507    e18: i8,
2508    e19: i8,
2509    e20: i8,
2510    e21: i8,
2511    e22: i8,
2512    e23: i8,
2513    e24: i8,
2514    e25: i8,
2515    e26: i8,
2516    e27: i8,
2517    e28: i8,
2518    e29: i8,
2519    e30: i8,
2520    e31: i8,
2521) -> __m256i {
2522    unsafe {
2523        #[rustfmt::skip]
2524        transmute(i8x32::new(
2525            e00, e01, e02, e03, e04, e05, e06, e07,
2526            e08, e09, e10, e11, e12, e13, e14, e15,
2527            e16, e17, e18, e19, e20, e21, e22, e23,
2528            e24, e25, e26, e27, e28, e29, e30, e31,
2529        ))
2530    }
2531}
2532
2533/// Sets packed 16-bit integers in returned vector with the supplied values in
2534/// reverse order.
2535///
2536/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi16)
2537#[inline]
2538#[target_feature(enable = "avx")]
2539// This intrinsic has no corresponding instruction.
2540#[stable(feature = "simd_x86", since = "1.27.0")]
2541pub fn _mm256_setr_epi16(
2542    e00: i16,
2543    e01: i16,
2544    e02: i16,
2545    e03: i16,
2546    e04: i16,
2547    e05: i16,
2548    e06: i16,
2549    e07: i16,
2550    e08: i16,
2551    e09: i16,
2552    e10: i16,
2553    e11: i16,
2554    e12: i16,
2555    e13: i16,
2556    e14: i16,
2557    e15: i16,
2558) -> __m256i {
2559    unsafe {
2560        #[rustfmt::skip]
2561        transmute(i16x16::new(
2562            e00, e01, e02, e03,
2563            e04, e05, e06, e07,
2564            e08, e09, e10, e11,
2565            e12, e13, e14, e15,
2566        ))
2567    }
2568}
2569
2570/// Sets packed 32-bit integers in returned vector with the supplied values in
2571/// reverse order.
2572///
2573/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi32)
2574#[inline]
2575#[target_feature(enable = "avx")]
2576// This intrinsic has no corresponding instruction.
2577#[stable(feature = "simd_x86", since = "1.27.0")]
2578pub fn _mm256_setr_epi32(
2579    e0: i32,
2580    e1: i32,
2581    e2: i32,
2582    e3: i32,
2583    e4: i32,
2584    e5: i32,
2585    e6: i32,
2586    e7: i32,
2587) -> __m256i {
2588    unsafe { transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) }
2589}
2590
2591/// Sets packed 64-bit integers in returned vector with the supplied values in
2592/// reverse order.
2593///
2594/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi64x)
2595#[inline]
2596#[target_feature(enable = "avx")]
2597// This intrinsic has no corresponding instruction.
2598#[stable(feature = "simd_x86", since = "1.27.0")]
2599pub fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2600    unsafe { transmute(i64x4::new(a, b, c, d)) }
2601}
2602
2603/// Broadcasts double-precision (64-bit) floating-point value `a` to all
2604/// elements of returned vector.
2605///
2606/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_pd)
2607#[inline]
2608#[target_feature(enable = "avx")]
2609// This intrinsic has no corresponding instruction.
2610#[stable(feature = "simd_x86", since = "1.27.0")]
2611pub fn _mm256_set1_pd(a: f64) -> __m256d {
2612    _mm256_setr_pd(a, a, a, a)
2613}
2614
2615/// Broadcasts single-precision (32-bit) floating-point value `a` to all
2616/// elements of returned vector.
2617///
2618/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_ps)
2619#[inline]
2620#[target_feature(enable = "avx")]
2621// This intrinsic has no corresponding instruction.
2622#[stable(feature = "simd_x86", since = "1.27.0")]
2623pub fn _mm256_set1_ps(a: f32) -> __m256 {
2624    _mm256_setr_ps(a, a, a, a, a, a, a, a)
2625}
2626
2627/// Broadcasts 8-bit integer `a` to all elements of returned vector.
2628/// This intrinsic may generate the `vpbroadcastb` instruction.
2629///
2630/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi8)
2631#[inline]
2632#[target_feature(enable = "avx")]
2633// This intrinsic has no corresponding instruction.
2634#[stable(feature = "simd_x86", since = "1.27.0")]
2635pub fn _mm256_set1_epi8(a: i8) -> __m256i {
2636    #[rustfmt::skip]
2637    _mm256_setr_epi8(
2638        a, a, a, a, a, a, a, a,
2639        a, a, a, a, a, a, a, a,
2640        a, a, a, a, a, a, a, a,
2641        a, a, a, a, a, a, a, a,
2642    )
2643}
2644
2645/// Broadcasts 16-bit integer `a` to all elements of returned vector.
2646/// This intrinsic may generate the `vpbroadcastw` instruction.
2647///
2648/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
2649#[inline]
2650#[target_feature(enable = "avx")]
2651//#[cfg_attr(test, assert_instr(vpshufb))]
2652#[cfg_attr(test, assert_instr(vinsertf128))]
2653// This intrinsic has no corresponding instruction.
2654#[stable(feature = "simd_x86", since = "1.27.0")]
2655pub fn _mm256_set1_epi16(a: i16) -> __m256i {
2656    _mm256_setr_epi16(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
2657}
2658
2659/// Broadcasts 32-bit integer `a` to all elements of returned vector.
2660/// This intrinsic may generate the `vpbroadcastd` instruction.
2661///
2662/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32)
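///
/// # Examples
///
/// A minimal usage sketch (illustrative; assumes AVX support is detected at
/// runtime before calling the intrinsic):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         // SAFETY: AVX support was verified above.
///         unsafe {
///             let v = _mm256_set1_epi32(7);
///             // All eight 32-bit lanes hold the same value.
///             assert_eq!(_mm256_extract_epi32::<0>(v), 7);
///             assert_eq!(_mm256_extract_epi32::<7>(v), 7);
///         }
///     }
/// }
/// ```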
2663#[inline]
2664#[target_feature(enable = "avx")]
2665// This intrinsic has no corresponding instruction.
2666#[stable(feature = "simd_x86", since = "1.27.0")]
2667pub fn _mm256_set1_epi32(a: i32) -> __m256i {
2668    _mm256_setr_epi32(a, a, a, a, a, a, a, a)
2669}
2670
2671/// Broadcasts 64-bit integer `a` to all elements of returned vector.
2672/// This intrinsic may generate the `vpbroadcastq` instruction.
2673///
2674/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x)
2675#[inline]
2676#[target_feature(enable = "avx")]
2677#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(vinsertf128))]
2678#[cfg_attr(all(test, target_arch = "x86"), assert_instr(vbroadcastsd))]
2679// This intrinsic has no corresponding instruction.
2680#[stable(feature = "simd_x86", since = "1.27.0")]
2681pub fn _mm256_set1_epi64x(a: i64) -> __m256i {
2682    _mm256_setr_epi64x(a, a, a, a)
2683}
2684
2685/// Casts vector of type __m256d to type __m256.
2686///
2687/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_ps)
2688#[inline]
2689#[target_feature(enable = "avx")]
2690// This intrinsic is only used for compilation and does not generate any
2691// instructions, thus it has zero latency.
2692#[stable(feature = "simd_x86", since = "1.27.0")]
2693pub fn _mm256_castpd_ps(a: __m256d) -> __m256 {
2694    unsafe { transmute(a) }
2695}
2696
2697/// Casts vector of type __m256 to type __m256d.
2698///
2699/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_pd)
2700#[inline]
2701#[target_feature(enable = "avx")]
2702// This intrinsic is only used for compilation and does not generate any
2703// instructions, thus it has zero latency.
2704#[stable(feature = "simd_x86", since = "1.27.0")]
2705pub fn _mm256_castps_pd(a: __m256) -> __m256d {
2706    unsafe { transmute(a) }
2707}
2708
2709/// Casts vector of type __m256 to type __m256i.
2710///
2711/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256)
2712#[inline]
2713#[target_feature(enable = "avx")]
2714// This intrinsic is only used for compilation and does not generate any
2715// instructions, thus it has zero latency.
2716#[stable(feature = "simd_x86", since = "1.27.0")]
2717pub fn _mm256_castps_si256(a: __m256) -> __m256i {
2718    unsafe { transmute(a) }
2719}
2720
2721/// Casts vector of type __m256i to type __m256.
2722///
2723/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps)
2724#[inline]
2725#[target_feature(enable = "avx")]
2726// This intrinsic is only used for compilation and does not generate any
2727// instructions, thus it has zero latency.
2728#[stable(feature = "simd_x86", since = "1.27.0")]
2729pub fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
2730    unsafe { transmute(a) }
2731}
2732
2733/// Casts vector of type __m256d to type __m256i.
2734///
2735/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_si256)
2736#[inline]
2737#[target_feature(enable = "avx")]
2738// This intrinsic is only used for compilation and does not generate any
2739// instructions, thus it has zero latency.
2740#[stable(feature = "simd_x86", since = "1.27.0")]
2741pub fn _mm256_castpd_si256(a: __m256d) -> __m256i {
2742    unsafe { transmute(a) }
2743}
2744
2745/// Casts vector of type __m256i to type __m256d.
2746///
2747/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_pd)
2748#[inline]
2749#[target_feature(enable = "avx")]
2750// This intrinsic is only used for compilation and does not generate any
2751// instructions, thus it has zero latency.
2752#[stable(feature = "simd_x86", since = "1.27.0")]
2753pub fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
2754    unsafe { transmute(a) }
2755}
2756
2757/// Casts vector of type __m256 to type __m128.
2758///
2759/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps256_ps128)
2760#[inline]
2761#[target_feature(enable = "avx")]
2762// This intrinsic is only used for compilation and does not generate any
2763// instructions, thus it has zero latency.
2764#[stable(feature = "simd_x86", since = "1.27.0")]
2765pub fn _mm256_castps256_ps128(a: __m256) -> __m128 {
2766    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) }
2767}
2768
2769/// Casts vector of type __m256d to type __m128d.
2770///
2771/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd256_pd128)
2772#[inline]
2773#[target_feature(enable = "avx")]
2774// This intrinsic is only used for compilation and does not generate any
2775// instructions, thus it has zero latency.
2776#[stable(feature = "simd_x86", since = "1.27.0")]
2777pub fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
2778    unsafe { simd_shuffle!(a, a, [0, 1]) }
2779}
2780
2781/// Casts vector of type __m256i to type __m128i.
2782///
2783/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128)
2784#[inline]
2785#[target_feature(enable = "avx")]
2786// This intrinsic is only used for compilation and does not generate any
2787// instructions, thus it has zero latency.
2788#[stable(feature = "simd_x86", since = "1.27.0")]
2789pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
2790    unsafe {
2791        let a = a.as_i64x4();
2792        let dst: i64x2 = simd_shuffle!(a, a, [0, 1]);
2793        transmute(dst)
2794    }
2795}
2796
2797/// Casts vector of type __m128 to type __m256;
2798/// the upper 128 bits of the result are undefined.
2799///
2800/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps128_ps256)
2801#[inline]
2802#[target_feature(enable = "avx")]
2803// This intrinsic is only used for compilation and does not generate any
2804// instructions, thus it has zero latency.
2805#[stable(feature = "simd_x86", since = "1.27.0")]
2806pub fn _mm256_castps128_ps256(a: __m128) -> __m256 {
2807    unsafe { simd_shuffle!(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4]) }
2808}
2809
2810/// Casts vector of type __m128d to type __m256d;
2811/// the upper 128 bits of the result are undefined.
2812///
2813/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd128_pd256)
2814#[inline]
2815#[target_feature(enable = "avx")]
2816// This intrinsic is only used for compilation and does not generate any
2817// instructions, thus it has zero latency.
2818#[stable(feature = "simd_x86", since = "1.27.0")]
2819pub fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
2820    unsafe { simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2]) }
2821}
2822
2823/// Casts vector of type __m128i to type __m256i;
2824/// the upper 128 bits of the result are undefined.
2825///
2826/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256)
2827#[inline]
2828#[target_feature(enable = "avx")]
2829// This intrinsic is only used for compilation and does not generate any
2830// instructions, thus it has zero latency.
2831#[stable(feature = "simd_x86", since = "1.27.0")]
2832pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
2833    unsafe {
2834        let a = a.as_i64x2();
2835        let undefined = i64x2::ZERO;
2836        let dst: i64x4 = simd_shuffle!(a, undefined, [0, 1, 2, 2]);
2837        transmute(dst)
2838    }
2839}
2840
2841/// Constructs a 256-bit floating-point vector of `[8 x float]` from a
2842/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain
2843/// the value of the source vector. The upper 128 bits are set to zero.
2844///
2845/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256)
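///
/// # Examples
///
/// A minimal usage sketch (illustrative; assumes AVX support is detected at
/// runtime). Unlike `_mm256_castps128_ps256`, the upper half of the result is
/// guaranteed to be zero:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         // SAFETY: AVX support was verified above.
///         unsafe {
///             let wide = _mm256_zextps128_ps256(_mm_set1_ps(1.0));
///             // The upper 128 bits compare equal to zero in every lane.
///             let upper = _mm256_extractf128_ps::<1>(wide);
///             assert_eq!(_mm_movemask_ps(_mm_cmpeq_ps(upper, _mm_setzero_ps())), 0b1111);
///         }
///     }
/// }
/// ```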
2846#[inline]
2847#[target_feature(enable = "avx")]
2848// This intrinsic is only used for compilation and does not generate any
2849// instructions, thus it has zero latency.
2850#[stable(feature = "simd_x86", since = "1.27.0")]
2851pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
2852    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) }
2853}
2854
2855/// Constructs a 256-bit integer vector from a 128-bit integer vector.
2856/// The lower 128 bits contain the value of the source vector. The upper
2857/// 128 bits are set to zero.
2858///
2859/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256)
2860#[inline]
2861#[target_feature(enable = "avx")]
2862// This intrinsic is only used for compilation and does not generate any
2863// instructions, thus it has zero latency.
2864#[stable(feature = "simd_x86", since = "1.27.0")]
2865pub fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
2866    unsafe {
2867        let b = i64x2::ZERO;
2868        let dst: i64x4 = simd_shuffle!(a.as_i64x2(), b, [0, 1, 2, 3]);
2869        transmute(dst)
2870    }
2871}
2872
2873/// Constructs a 256-bit floating-point vector of `[4 x double]` from a
2874/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits
2875/// contain the value of the source vector. The upper 128 bits are set
2876/// to zero.
2877///
2878/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256)
2879#[inline]
2880#[target_feature(enable = "avx")]
2881// This intrinsic is only used for compilation and does not generate any
2882// instructions, thus it has zero latency.
2883#[stable(feature = "simd_x86", since = "1.27.0")]
2884pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
2885    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0, 1, 2, 3]) }
2886}
2887
2888/// Returns vector of type `__m256` with indeterminate elements.
2889/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
2890/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
2891/// In practice, this is typically equivalent to [`mem::zeroed`].
2892///
2893/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_ps)
2894#[inline]
2895#[target_feature(enable = "avx")]
2896// This intrinsic has no corresponding instruction.
2897#[stable(feature = "simd_x86", since = "1.27.0")]
2898pub fn _mm256_undefined_ps() -> __m256 {
2899    const { unsafe { mem::zeroed() } }
2900}
2901
2902/// Returns vector of type `__m256d` with indeterminate elements.
2903/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
2904/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
2905/// In practice, this is typically equivalent to [`mem::zeroed`].
2906///
2907/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_pd)
2908#[inline]
2909#[target_feature(enable = "avx")]
2910// This intrinsic has no corresponding instruction.
2911#[stable(feature = "simd_x86", since = "1.27.0")]
2912pub fn _mm256_undefined_pd() -> __m256d {
2913    const { unsafe { mem::zeroed() } }
2914}
2915
2916/// Returns vector of type `__m256i` with indeterminate elements.
2917/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
2918/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
2919/// In practice, this is typically equivalent to [`mem::zeroed`].
2920///
2921/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_si256)
2922#[inline]
2923#[target_feature(enable = "avx")]
2924// This intrinsic has no corresponding instruction.
2925#[stable(feature = "simd_x86", since = "1.27.0")]
2926pub fn _mm256_undefined_si256() -> __m256i {
2927    const { unsafe { mem::zeroed() } }
2928}
2929
2930/// Sets packed __m256 returned vector with the supplied values.
2931///
2932/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128)
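///
/// # Examples
///
/// A minimal usage sketch (illustrative; assumes AVX support is detected at
/// runtime). `lo` supplies the lower 128 bits of the result and `hi` the
/// upper 128 bits:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         // SAFETY: AVX support was verified above.
///         unsafe {
///             let v = _mm256_set_m128(_mm_set1_ps(2.0), _mm_set1_ps(1.0));
///             assert_eq!(_mm256_cvtss_f32(v), 1.0);
///             assert_eq!(_mm_cvtss_f32(_mm256_extractf128_ps::<1>(v)), 2.0);
///         }
///     }
/// }
/// ```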
2933#[inline]
2934#[target_feature(enable = "avx")]
2935#[cfg_attr(test, assert_instr(vinsertf128))]
2936#[stable(feature = "simd_x86", since = "1.27.0")]
2937pub fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
2938    unsafe { simd_shuffle!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7]) }
2939}
2940
2941/// Sets packed __m256d returned vector with the supplied values.
2942///
2943/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128d)
2944#[inline]
2945#[target_feature(enable = "avx")]
2946#[cfg_attr(test, assert_instr(vinsertf128))]
2947#[stable(feature = "simd_x86", since = "1.27.0")]
2948pub fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
2949    unsafe {
2950        let hi: __m128 = transmute(hi);
2951        let lo: __m128 = transmute(lo);
2952        transmute(_mm256_set_m128(hi, lo))
2953    }
2954}
2955
2956/// Sets packed __m256i returned vector with the supplied values.
2957///
2958/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i)
2959#[inline]
2960#[target_feature(enable = "avx")]
2961#[cfg_attr(test, assert_instr(vinsertf128))]
2962#[stable(feature = "simd_x86", since = "1.27.0")]
2963pub fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
2964    unsafe {
2965        let hi: __m128 = transmute(hi);
2966        let lo: __m128 = transmute(lo);
2967        transmute(_mm256_set_m128(hi, lo))
2968    }
2969}
2970
2971/// Sets packed __m256 returned vector with the supplied values.
2972///
2973/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128)
2974#[inline]
2975#[target_feature(enable = "avx")]
2976#[cfg_attr(test, assert_instr(vinsertf128))]
2977#[stable(feature = "simd_x86", since = "1.27.0")]
2978pub fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
2979    _mm256_set_m128(hi, lo)
2980}
2981
2982/// Sets packed __m256d returned vector with the supplied values.
2983///
2984/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128d)
2985#[inline]
2986#[target_feature(enable = "avx")]
2987#[cfg_attr(test, assert_instr(vinsertf128))]
2988#[stable(feature = "simd_x86", since = "1.27.0")]
2989pub fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
2990    _mm256_set_m128d(hi, lo)
2991}
2992
2993/// Sets packed __m256i returned vector with the supplied values.
2994///
2995/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128i)
2996#[inline]
2997#[target_feature(enable = "avx")]
2998#[cfg_attr(test, assert_instr(vinsertf128))]
2999#[stable(feature = "simd_x86", since = "1.27.0")]
3000pub fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
3001    _mm256_set_m128i(hi, lo)
3002}
3003
3004/// Loads two 128-bit values (composed of 4 packed single-precision (32-bit)
3005/// floating-point elements) from memory, and combines them into a 256-bit
3006/// value.
3007/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3008///
3009/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128)
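///
/// # Examples
///
/// A minimal usage sketch (illustrative; assumes AVX support is detected at
/// runtime and that both pointers are valid for four `f32` reads):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         let lo = [1.0_f32, 2.0, 3.0, 4.0];
///         let hi = [5.0_f32, 6.0, 7.0, 8.0];
///         // SAFETY: AVX support was verified above and both arrays hold four `f32`s.
///         unsafe {
///             let v = _mm256_loadu2_m128(hi.as_ptr(), lo.as_ptr());
///             // `lo` fills the lower 128 bits of the result.
///             assert_eq!(_mm256_cvtss_f32(v), 1.0);
///         }
///     }
/// }
/// ```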
3010#[inline]
3011#[target_feature(enable = "avx")]
3012// This intrinsic has no corresponding instruction.
3013#[stable(feature = "simd_x86", since = "1.27.0")]
3014pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 {
3015    let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
3016    _mm256_insertf128_ps::<1>(a, _mm_loadu_ps(hiaddr))
3017}
3018
3019/// Loads two 128-bit values (composed of 2 packed double-precision (64-bit)
3020/// floating-point elements) from memory, and combines them into a 256-bit
3021/// value.
3022/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3023///
3024/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128d)
3025#[inline]
3026#[target_feature(enable = "avx")]
3027// This intrinsic has no corresponding instruction.
3028#[stable(feature = "simd_x86", since = "1.27.0")]
3029pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d {
3030    let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
3031    _mm256_insertf128_pd::<1>(a, _mm_loadu_pd(hiaddr))
3032}
3033
3034/// Loads two 128-bit values (composed of integer data) from memory, and combines
3035/// them into a 256-bit value.
3036/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3037///
3038/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128i)
3039#[inline]
3040#[target_feature(enable = "avx")]
3041// This intrinsic has no corresponding instruction.
3042#[stable(feature = "simd_x86", since = "1.27.0")]
3043pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i {
3044    let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr));
3045    _mm256_insertf128_si256::<1>(a, _mm_loadu_si128(hiaddr))
3046}
3047
3048/// Stores the high and low 128-bit halves (each composed of 4 packed
3049/// single-precision (32-bit) floating-point elements) from `a` into memory at two
3050/// different 128-bit locations.
3051/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3052///
3053/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128)
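///
/// # Examples
///
/// A minimal usage sketch (illustrative; assumes AVX support is detected at
/// runtime and that both destinations are valid for four `f32` writes):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         let mut lo = [0.0_f32; 4];
///         let mut hi = [0.0_f32; 4];
///         // SAFETY: AVX support was verified above and both buffers hold four `f32`s.
///         unsafe {
///             let v = _mm256_set_m128(_mm_set1_ps(2.0), _mm_set1_ps(1.0));
///             _mm256_storeu2_m128(hi.as_mut_ptr(), lo.as_mut_ptr(), v);
///         }
///         assert_eq!(lo, [1.0; 4]);
///         assert_eq!(hi, [2.0; 4]);
///     }
/// }
/// ```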
3054#[inline]
3055#[target_feature(enable = "avx")]
3056// This intrinsic has no corresponding instruction.
3057#[stable(feature = "simd_x86", since = "1.27.0")]
3058pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) {
3059    let lo = _mm256_castps256_ps128(a);
3060    _mm_storeu_ps(loaddr, lo);
3061    let hi = _mm256_extractf128_ps::<1>(a);
3062    _mm_storeu_ps(hiaddr, hi);
3063}
3064
3065/// Stores the high and low 128-bit halves (each composed of 2 packed
3066/// double-precision (64-bit) floating-point elements) from `a` into memory at two
3067/// different 128-bit locations.
3068/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3069///
3070/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128d)
3071#[inline]
3072#[target_feature(enable = "avx")]
3073// This intrinsic has no corresponding instruction.
3074#[stable(feature = "simd_x86", since = "1.27.0")]
3075pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) {
3076    let lo = _mm256_castpd256_pd128(a);
3077    _mm_storeu_pd(loaddr, lo);
3078    let hi = _mm256_extractf128_pd::<1>(a);
3079    _mm_storeu_pd(hiaddr, hi);
3080}
3081
3082/// Stores the high and low 128-bit halves (each composed of integer data) from
3083/// `a` into memory at two different 128-bit locations.
3084/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
3085///
3086/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128i)
3087#[inline]
3088#[target_feature(enable = "avx")]
3089// This intrinsic has no corresponding instruction.
3090#[stable(feature = "simd_x86", since = "1.27.0")]
3091pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) {
3092    let lo = _mm256_castsi256_si128(a);
3093    _mm_storeu_si128(loaddr, lo);
3094    let hi = _mm256_extractf128_si256::<1>(a);
3095    _mm_storeu_si128(hiaddr, hi);
3096}
3097
3098/// Returns the first element of the input vector of `[8 x float]`.
3099///
3100/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtss_f32)
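///
/// # Examples
///
/// A minimal usage sketch (illustrative; assumes AVX support is detected at
/// runtime):
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         // SAFETY: AVX support was verified above.
///         unsafe {
///             let v = _mm256_setr_ps(9.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
///             assert_eq!(_mm256_cvtss_f32(v), 9.0);
///         }
///     }
/// }
/// ```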
3101#[inline]
3102#[target_feature(enable = "avx")]
3103//#[cfg_attr(test, assert_instr(movss))] FIXME
3104#[stable(feature = "simd_x86", since = "1.27.0")]
3105pub fn _mm256_cvtss_f32(a: __m256) -> f32 {
3106    unsafe { simd_extract!(a, 0) }
3107}
3108
3109// LLVM intrinsics used in the above functions
3110#[allow(improper_ctypes)]
3111unsafe extern "C" {
3112    #[link_name = "llvm.x86.avx.round.pd.256"]
3113    fn roundpd256(a: __m256d, b: i32) -> __m256d;
3114    #[link_name = "llvm.x86.avx.round.ps.256"]
3115    fn roundps256(a: __m256, b: i32) -> __m256;
3116    #[link_name = "llvm.x86.avx.dp.ps.256"]
3117    fn vdpps(a: __m256, b: __m256, imm8: i8) -> __m256;
3118    #[link_name = "llvm.x86.sse2.cmp.pd"]
3119    fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3120    #[link_name = "llvm.x86.avx.cmp.pd.256"]
3121    fn vcmppd256(a: __m256d, b: __m256d, imm8: u8) -> __m256d;
3122    #[link_name = "llvm.x86.sse.cmp.ps"]
3123    fn vcmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
3124    #[link_name = "llvm.x86.avx.cmp.ps.256"]
3125    fn vcmpps256(a: __m256, b: __m256, imm8: u8) -> __m256;
3126    #[link_name = "llvm.x86.sse2.cmp.sd"]
3127    fn vcmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3128    #[link_name = "llvm.x86.sse.cmp.ss"]
3129    fn vcmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
3130    #[link_name = "llvm.x86.avx.cvt.ps2dq.256"]
3131    fn vcvtps2dq(a: __m256) -> i32x8;
3132    #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"]
3133    fn vcvttpd2dq(a: __m256d) -> i32x4;
3134    #[link_name = "llvm.x86.avx.cvt.pd2dq.256"]
3135    fn vcvtpd2dq(a: __m256d) -> i32x4;
3136    #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"]
3137    fn vcvttps2dq(a: __m256) -> i32x8;
3138    #[link_name = "llvm.x86.avx.vzeroall"]
3139    fn vzeroall();
3140    #[link_name = "llvm.x86.avx.vzeroupper"]
3141    fn vzeroupper();
3142    #[link_name = "llvm.x86.avx.vpermilvar.ps.256"]
3143    fn vpermilps256(a: __m256, b: i32x8) -> __m256;
3144    #[link_name = "llvm.x86.avx.vpermilvar.ps"]
3145    fn vpermilps(a: __m128, b: i32x4) -> __m128;
3146    #[link_name = "llvm.x86.avx.vpermilvar.pd.256"]
3147    fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
3148    #[link_name = "llvm.x86.avx.vpermilvar.pd"]
3149    fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
3150    #[link_name = "llvm.x86.avx.maskload.pd.256"]
3151    fn maskloadpd256(mem_addr: *const i8, mask: i64x4) -> __m256d;
3152    #[link_name = "llvm.x86.avx.maskstore.pd.256"]
3153    fn maskstorepd256(mem_addr: *mut i8, mask: i64x4, a: __m256d);
3154    #[link_name = "llvm.x86.avx.maskload.pd"]
3155    fn maskloadpd(mem_addr: *const i8, mask: i64x2) -> __m128d;
3156    #[link_name = "llvm.x86.avx.maskstore.pd"]
3157    fn maskstorepd(mem_addr: *mut i8, mask: i64x2, a: __m128d);
3158    #[link_name = "llvm.x86.avx.maskload.ps.256"]
3159    fn maskloadps256(mem_addr: *const i8, mask: i32x8) -> __m256;
3160    #[link_name = "llvm.x86.avx.maskstore.ps.256"]
3161    fn maskstoreps256(mem_addr: *mut i8, mask: i32x8, a: __m256);
3162    #[link_name = "llvm.x86.avx.maskload.ps"]
3163    fn maskloadps(mem_addr: *const i8, mask: i32x4) -> __m128;
3164    #[link_name = "llvm.x86.avx.maskstore.ps"]
3165    fn maskstoreps(mem_addr: *mut i8, mask: i32x4, a: __m128);
3166    #[link_name = "llvm.x86.avx.ldu.dq.256"]
3167    fn vlddqu(mem_addr: *const i8) -> i8x32;
3168    #[link_name = "llvm.x86.avx.rcp.ps.256"]
3169    fn vrcpps(a: __m256) -> __m256;
3170    #[link_name = "llvm.x86.avx.rsqrt.ps.256"]
3171    fn vrsqrtps(a: __m256) -> __m256;
3172    #[link_name = "llvm.x86.avx.ptestnzc.256"]
3173    fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
3174    #[link_name = "llvm.x86.avx.vtestz.pd.256"]
3175    fn vtestzpd256(a: __m256d, b: __m256d) -> i32;
3176    #[link_name = "llvm.x86.avx.vtestc.pd.256"]
3177    fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
3178    #[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
3179    fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
3180    #[link_name = "llvm.x86.avx.vtestnzc.pd"]
3181    fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
3182    #[link_name = "llvm.x86.avx.vtestz.ps.256"]
3183    fn vtestzps256(a: __m256, b: __m256) -> i32;
3184    #[link_name = "llvm.x86.avx.vtestc.ps.256"]
3185    fn vtestcps256(a: __m256, b: __m256) -> i32;
3186    #[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
3187    fn vtestnzcps256(a: __m256, b: __m256) -> i32;
3188    #[link_name = "llvm.x86.avx.vtestnzc.ps"]
3189    fn vtestnzcps(a: __m128, b: __m128) -> i32;
3190    #[link_name = "llvm.x86.avx.min.ps.256"]
3191    fn vminps(a: __m256, b: __m256) -> __m256;
3192    #[link_name = "llvm.x86.avx.max.ps.256"]
3193    fn vmaxps(a: __m256, b: __m256) -> __m256;
3194    #[link_name = "llvm.x86.avx.min.pd.256"]
3195    fn vminpd(a: __m256d, b: __m256d) -> __m256d;
3196    #[link_name = "llvm.x86.avx.max.pd.256"]
3197    fn vmaxpd(a: __m256d, b: __m256d) -> __m256d;
3198}
3199
3200#[cfg(test)]
3201mod tests {
3202    use crate::hint::black_box;
3203    use crate::ptr;
3204    use stdarch_test::simd_test;
3205
3206    use crate::core_arch::x86::*;
3207
3208    #[simd_test(enable = "avx")]
3209    unsafe fn test_mm256_add_pd() {
3210        let a = _mm256_setr_pd(1., 2., 3., 4.);
3211        let b = _mm256_setr_pd(5., 6., 7., 8.);
3212        let r = _mm256_add_pd(a, b);
3213        let e = _mm256_setr_pd(6., 8., 10., 12.);
3214        assert_eq_m256d(r, e);
3215    }
3216
3217    #[simd_test(enable = "avx")]
3218    unsafe fn test_mm256_add_ps() {
3219        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3220        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3221        let r = _mm256_add_ps(a, b);
3222        let e = _mm256_setr_ps(10., 12., 14., 16., 18., 20., 22., 24.);
3223        assert_eq_m256(r, e);
3224    }
3225
3226    #[simd_test(enable = "avx")]
3227    unsafe fn test_mm256_and_pd() {
3228        let a = _mm256_set1_pd(1.);
3229        let b = _mm256_set1_pd(0.6);
3230        let r = _mm256_and_pd(a, b);
3231        let e = _mm256_set1_pd(0.5);
3232        assert_eq_m256d(r, e);
3233    }
3234
3235    #[simd_test(enable = "avx")]
3236    unsafe fn test_mm256_and_ps() {
3237        let a = _mm256_set1_ps(1.);
3238        let b = _mm256_set1_ps(0.6);
3239        let r = _mm256_and_ps(a, b);
3240        let e = _mm256_set1_ps(0.5);
3241        assert_eq_m256(r, e);
3242    }
3243
3244    #[simd_test(enable = "avx")]
3245    unsafe fn test_mm256_or_pd() {
3246        let a = _mm256_set1_pd(1.);
3247        let b = _mm256_set1_pd(0.6);
3248        let r = _mm256_or_pd(a, b);
3249        let e = _mm256_set1_pd(1.2);
3250        assert_eq_m256d(r, e);
3251    }
3252
3253    #[simd_test(enable = "avx")]
3254    unsafe fn test_mm256_or_ps() {
3255        let a = _mm256_set1_ps(1.);
3256        let b = _mm256_set1_ps(0.6);
3257        let r = _mm256_or_ps(a, b);
3258        let e = _mm256_set1_ps(1.2);
3259        assert_eq_m256(r, e);
3260    }
3261
3262    #[simd_test(enable = "avx")]
3263    unsafe fn test_mm256_shuffle_pd() {
3264        let a = _mm256_setr_pd(1., 4., 5., 8.);
3265        let b = _mm256_setr_pd(2., 3., 6., 7.);
3266        let r = _mm256_shuffle_pd::<0b11_11_11_11>(a, b);
3267        let e = _mm256_setr_pd(4., 3., 8., 7.);
3268        assert_eq_m256d(r, e);
3269    }
3270
3271    #[simd_test(enable = "avx")]
3272    unsafe fn test_mm256_shuffle_ps() {
3273        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3274        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3275        let r = _mm256_shuffle_ps::<0b00_00_11_11>(a, b);
3276        let e = _mm256_setr_ps(8., 8., 2., 2., 16., 16., 10., 10.);
3277        assert_eq_m256(r, e);
3278    }
3279
3280    #[simd_test(enable = "avx")]
3281    unsafe fn test_mm256_andnot_pd() {
3282        let a = _mm256_set1_pd(0.);
3283        let b = _mm256_set1_pd(0.6);
3284        let r = _mm256_andnot_pd(a, b);
3285        assert_eq_m256d(r, b);
3286    }
3287
3288    #[simd_test(enable = "avx")]
3289    unsafe fn test_mm256_andnot_ps() {
3290        let a = _mm256_set1_ps(0.);
3291        let b = _mm256_set1_ps(0.6);
3292        let r = _mm256_andnot_ps(a, b);
3293        assert_eq_m256(r, b);
3294    }
3295
3296    #[simd_test(enable = "avx")]
3297    unsafe fn test_mm256_max_pd() {
3298        let a = _mm256_setr_pd(1., 4., 5., 8.);
3299        let b = _mm256_setr_pd(2., 3., 6., 7.);
3300        let r = _mm256_max_pd(a, b);
3301        let e = _mm256_setr_pd(2., 4., 6., 8.);
3302        assert_eq_m256d(r, e);
3303        // > If the values being compared are both 0.0s (of either sign), the
3304        // > value in the second operand (source operand) is returned.
3305        let w = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
3306        let x = _mm256_max_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
3307        let wu: [u64; 4] = transmute(w);
3308        let xu: [u64; 4] = transmute(x);
3309        assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
3310        assert_eq!(xu, [0u64; 4]);
3311        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3312        // > second operand (source operand), either a NaN or a valid
3313        // > floating-point value, is written to the result.
3314        let y = _mm256_max_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
3315        let z = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
3316        let yf: [f64; 4] = transmute(y);
3317        let zf: [f64; 4] = transmute(z);
3318        assert_eq!(yf, [0.0; 4]);
3319        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3320    }
3321
3322    #[simd_test(enable = "avx")]
3323    unsafe fn test_mm256_max_ps() {
3324        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3325        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3326        let r = _mm256_max_ps(a, b);
3327        let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
3328        assert_eq_m256(r, e);
3329        // > If the values being compared are both 0.0s (of either sign), the
3330        // > value in the second operand (source operand) is returned.
3331        let w = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
3332        let x = _mm256_max_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
3333        let wu: [u32; 8] = transmute(w);
3334        let xu: [u32; 8] = transmute(x);
3335        assert_eq!(wu, [0x8000_0000u32; 8]);
3336        assert_eq!(xu, [0u32; 8]);
3337        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3338        // > second operand (source operand), either a NaN or a valid
3339        // > floating-point value, is written to the result.
3340        let y = _mm256_max_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
3341        let z = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
3342        let yf: [f32; 8] = transmute(y);
3343        let zf: [f32; 8] = transmute(z);
3344        assert_eq!(yf, [0.0; 8]);
3345        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3346    }
3347
3348    #[simd_test(enable = "avx")]
3349    unsafe fn test_mm256_min_pd() {
3350        let a = _mm256_setr_pd(1., 4., 5., 8.);
3351        let b = _mm256_setr_pd(2., 3., 6., 7.);
3352        let r = _mm256_min_pd(a, b);
3353        let e = _mm256_setr_pd(1., 3., 5., 7.);
3354        assert_eq_m256d(r, e);
3355        // > If the values being compared are both 0.0s (of either sign), the
3356        // > value in the second operand (source operand) is returned.
3357        let w = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
3358        let x = _mm256_min_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
3359        let wu: [u64; 4] = transmute(w);
3360        let xu: [u64; 4] = transmute(x);
3361        assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
3362        assert_eq!(xu, [0u64; 4]);
3363        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3364        // > second operand (source operand), either a NaN or a valid
3365        // > floating-point value, is written to the result.
3366        let y = _mm256_min_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
3367        let z = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
3368        let yf: [f64; 4] = transmute(y);
3369        let zf: [f64; 4] = transmute(z);
3370        assert_eq!(yf, [0.0; 4]);
3371        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3372    }
3373
3374    #[simd_test(enable = "avx")]
3375    unsafe fn test_mm256_min_ps() {
3376        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3377        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3378        let r = _mm256_min_ps(a, b);
3379        let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
3380        assert_eq_m256(r, e);
3381        // > If the values being compared are both 0.0s (of either sign), the
3382        // > value in the second operand (source operand) is returned.
3383        let w = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
3384        let x = _mm256_min_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
3385        let wu: [u32; 8] = transmute(w);
3386        let xu: [u32; 8] = transmute(x);
3387        assert_eq!(wu, [0x8000_0000u32; 8]);
3388        assert_eq!(xu, [0u32; 8]);
3389        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3390        // > second operand (source operand), either a NaN or a valid
3391        // > floating-point value, is written to the result.
3392        let y = _mm256_min_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
3393        let z = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
3394        let yf: [f32; 8] = transmute(y);
3395        let zf: [f32; 8] = transmute(z);
3396        assert_eq!(yf, [0.0; 8]);
3397        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3398    }
3399
3400    #[simd_test(enable = "avx")]
3401    unsafe fn test_mm256_mul_pd() {
3402        let a = _mm256_setr_pd(1., 2., 3., 4.);
3403        let b = _mm256_setr_pd(5., 6., 7., 8.);
3404        let r = _mm256_mul_pd(a, b);
3405        let e = _mm256_setr_pd(5., 12., 21., 32.);
3406        assert_eq_m256d(r, e);
3407    }
3408
3409    #[simd_test(enable = "avx")]
3410    unsafe fn test_mm256_mul_ps() {
3411        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3412        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3413        let r = _mm256_mul_ps(a, b);
3414        let e = _mm256_setr_ps(9., 20., 33., 48., 65., 84., 105., 128.);
3415        assert_eq_m256(r, e);
3416    }
3417
3418    #[simd_test(enable = "avx")]
3419    unsafe fn test_mm256_addsub_pd() {
3420        let a = _mm256_setr_pd(1., 2., 3., 4.);
3421        let b = _mm256_setr_pd(5., 6., 7., 8.);
3422        let r = _mm256_addsub_pd(a, b);
3423        let e = _mm256_setr_pd(-4., 8., -4., 12.);
3424        assert_eq_m256d(r, e);
3425    }
3426
3427    #[simd_test(enable = "avx")]
3428    unsafe fn test_mm256_addsub_ps() {
3429        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3430        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3431        let r = _mm256_addsub_ps(a, b);
3432        let e = _mm256_setr_ps(-4., 8., -4., 12., -4., 8., -4., 12.);
3433        assert_eq_m256(r, e);
3434    }
3435
3436    #[simd_test(enable = "avx")]
3437    unsafe fn test_mm256_sub_pd() {
3438        let a = _mm256_setr_pd(1., 2., 3., 4.);
3439        let b = _mm256_setr_pd(5., 6., 7., 8.);
3440        let r = _mm256_sub_pd(a, b);
3441        let e = _mm256_setr_pd(-4., -4., -4., -4.);
3442        assert_eq_m256d(r, e);
3443    }
3444
3445    #[simd_test(enable = "avx")]
3446    unsafe fn test_mm256_sub_ps() {
3447        let a = _mm256_setr_ps(1., 2., 3., 4., -1., -2., -3., -4.);
3448        let b = _mm256_setr_ps(5., 6., 7., 8., 3., 2., 1., 0.);
3449        let r = _mm256_sub_ps(a, b);
3450        let e = _mm256_setr_ps(-4., -4., -4., -4., -4., -4., -4., -4.);
3451        assert_eq_m256(r, e);
3452    }
3453
3454    #[simd_test(enable = "avx")]
3455    unsafe fn test_mm256_round_pd() {
3456        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3457        let result_closest = _mm256_round_pd::<0b0000>(a);
3458        let result_down = _mm256_round_pd::<0b0001>(a);
3459        let result_up = _mm256_round_pd::<0b0010>(a);
3460        let expected_closest = _mm256_setr_pd(2., 2., 4., -1.);
3461        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3462        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3463        assert_eq_m256d(result_closest, expected_closest);
3464        assert_eq_m256d(result_down, expected_down);
3465        assert_eq_m256d(result_up, expected_up);
3466    }
3467
3468    #[simd_test(enable = "avx")]
3469    unsafe fn test_mm256_floor_pd() {
3470        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3471        let result_down = _mm256_floor_pd(a);
3472        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3473        assert_eq_m256d(result_down, expected_down);
3474    }
3475
3476    #[simd_test(enable = "avx")]
3477    unsafe fn test_mm256_ceil_pd() {
3478        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3479        let result_up = _mm256_ceil_pd(a);
3480        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3481        assert_eq_m256d(result_up, expected_up);
3482    }
3483
3484    #[simd_test(enable = "avx")]
3485    unsafe fn test_mm256_round_ps() {
3486        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3487        let result_closest = _mm256_round_ps::<0b0000>(a);
3488        let result_down = _mm256_round_ps::<0b0001>(a);
3489        let result_up = _mm256_round_ps::<0b0010>(a);
3490        let expected_closest = _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.);
3491        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3492        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3493        assert_eq_m256(result_closest, expected_closest);
3494        assert_eq_m256(result_down, expected_down);
3495        assert_eq_m256(result_up, expected_up);
3496    }
3497
3498    #[simd_test(enable = "avx")]
3499    unsafe fn test_mm256_floor_ps() {
3500        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3501        let result_down = _mm256_floor_ps(a);
3502        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3503        assert_eq_m256(result_down, expected_down);
3504    }
3505
3506    #[simd_test(enable = "avx")]
3507    unsafe fn test_mm256_ceil_ps() {
3508        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3509        let result_up = _mm256_ceil_ps(a);
3510        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3511        assert_eq_m256(result_up, expected_up);
3512    }
3513
3514    #[simd_test(enable = "avx")]
3515    unsafe fn test_mm256_sqrt_pd() {
3516        let a = _mm256_setr_pd(4., 9., 16., 25.);
3517        let r = _mm256_sqrt_pd(a);
3518        let e = _mm256_setr_pd(2., 3., 4., 5.);
3519        assert_eq_m256d(r, e);
3520    }
3521
3522    #[simd_test(enable = "avx")]
3523    unsafe fn test_mm256_sqrt_ps() {
3524        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3525        let r = _mm256_sqrt_ps(a);
3526        let e = _mm256_setr_ps(2., 3., 4., 5., 2., 3., 4., 5.);
3527        assert_eq_m256(r, e);
3528    }
3529
3530    #[simd_test(enable = "avx")]
3531    unsafe fn test_mm256_div_ps() {
3532        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3533        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3534        let r = _mm256_div_ps(a, b);
3535        let e = _mm256_setr_ps(1., 3., 8., 5., 0.5, 1., 0.25, 0.5);
3536        assert_eq_m256(r, e);
3537    }
3538
3539    #[simd_test(enable = "avx")]
3540    unsafe fn test_mm256_div_pd() {
3541        let a = _mm256_setr_pd(4., 9., 16., 25.);
3542        let b = _mm256_setr_pd(4., 3., 2., 5.);
3543        let r = _mm256_div_pd(a, b);
3544        let e = _mm256_setr_pd(1., 3., 8., 5.);
3545        assert_eq_m256d(r, e);
3546    }
3547
3548    #[simd_test(enable = "avx")]
3549    unsafe fn test_mm256_blend_pd() {
3550        let a = _mm256_setr_pd(4., 9., 16., 25.);
3551        let b = _mm256_setr_pd(4., 3., 2., 5.);
3552        let r = _mm256_blend_pd::<0x0>(a, b);
3553        assert_eq_m256d(r, _mm256_setr_pd(4., 9., 16., 25.));
3554        let r = _mm256_blend_pd::<0x3>(a, b);
3555        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 16., 25.));
3556        let r = _mm256_blend_pd::<0xF>(a, b);
3557        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 5.));
3558    }
3559
3560    #[simd_test(enable = "avx")]
3561    unsafe fn test_mm256_blend_ps() {
3562        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3563        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3564        let r = _mm256_blend_ps::<0x0>(a, b);
3565        assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.));
3566        let r = _mm256_blend_ps::<0x3>(a, b);
3567        assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.));
3568        let r = _mm256_blend_ps::<0xF>(a, b);
3569        assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.));
3570    }
3571
3572    #[simd_test(enable = "avx")]
3573    unsafe fn test_mm256_blendv_pd() {
3574        let a = _mm256_setr_pd(4., 9., 16., 25.);
3575        let b = _mm256_setr_pd(4., 3., 2., 5.);
3576        let c = _mm256_setr_pd(0., 0., !0 as f64, !0 as f64);
3577        let r = _mm256_blendv_pd(a, b, c);
3578        let e = _mm256_setr_pd(4., 9., 2., 5.);
3579        assert_eq_m256d(r, e);
3580    }
3581
3582    #[simd_test(enable = "avx")]
3583    unsafe fn test_mm256_blendv_ps() {
3584        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3585        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3586        #[rustfmt::skip]
3587        let c = _mm256_setr_ps(
3588            0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32,
3589        );
3590        let r = _mm256_blendv_ps(a, b, c);
3591        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
3592        assert_eq_m256(r, e);
3593    }
3594
3595    #[simd_test(enable = "avx")]
3596    unsafe fn test_mm256_dp_ps() {
3597        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3598        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3599        let r = _mm256_dp_ps::<0xFF>(a, b);
3600        let e = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
3601        assert_eq_m256(r, e);
3602    }
3603
3604    #[simd_test(enable = "avx")]
3605    unsafe fn test_mm256_hadd_pd() {
3606        let a = _mm256_setr_pd(4., 9., 16., 25.);
3607        let b = _mm256_setr_pd(4., 3., 2., 5.);
3608        let r = _mm256_hadd_pd(a, b);
3609        let e = _mm256_setr_pd(13., 7., 41., 7.);
3610        assert_eq_m256d(r, e);
3611
3612        let a = _mm256_setr_pd(1., 2., 3., 4.);
3613        let b = _mm256_setr_pd(5., 6., 7., 8.);
3614        let r = _mm256_hadd_pd(a, b);
3615        let e = _mm256_setr_pd(3., 11., 7., 15.);
3616        assert_eq_m256d(r, e);
3617    }
3618
3619    #[simd_test(enable = "avx")]
3620    unsafe fn test_mm256_hadd_ps() {
3621        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3622        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3623        let r = _mm256_hadd_ps(a, b);
3624        let e = _mm256_setr_ps(13., 41., 7., 7., 13., 41., 17., 114.);
3625        assert_eq_m256(r, e);
3626
3627        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3628        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3629        let r = _mm256_hadd_ps(a, b);
3630        let e = _mm256_setr_ps(3., 7., 11., 15., 3., 7., 11., 15.);
3631        assert_eq_m256(r, e);
3632    }
3633
3634    #[simd_test(enable = "avx")]
3635    unsafe fn test_mm256_hsub_pd() {
3636        let a = _mm256_setr_pd(4., 9., 16., 25.);
3637        let b = _mm256_setr_pd(4., 3., 2., 5.);
3638        let r = _mm256_hsub_pd(a, b);
3639        let e = _mm256_setr_pd(-5., 1., -9., -3.);
3640        assert_eq_m256d(r, e);
3641
3642        let a = _mm256_setr_pd(1., 2., 3., 4.);
3643        let b = _mm256_setr_pd(5., 6., 7., 8.);
3644        let r = _mm256_hsub_pd(a, b);
3645        let e = _mm256_setr_pd(-1., -1., -1., -1.);
3646        assert_eq_m256d(r, e);
3647    }
3648
3649    #[simd_test(enable = "avx")]
3650    unsafe fn test_mm256_hsub_ps() {
3651        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3652        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3653        let r = _mm256_hsub_ps(a, b);
3654        let e = _mm256_setr_ps(-5., -9., 1., -3., -5., -9., -1., 14.);
3655        assert_eq_m256(r, e);
3656
3657        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3658        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3659        let r = _mm256_hsub_ps(a, b);
3660        let e = _mm256_setr_ps(-1., -1., -1., -1., -1., -1., -1., -1.);
3661        assert_eq_m256(r, e);
3662    }
3663
3664    #[simd_test(enable = "avx")]
3665    unsafe fn test_mm256_xor_pd() {
3666        let a = _mm256_setr_pd(4., 9., 16., 25.);
3667        let b = _mm256_set1_pd(0.);
3668        let r = _mm256_xor_pd(a, b);
3669        assert_eq_m256d(r, a);
3670    }
3671
3672    #[simd_test(enable = "avx")]
3673    unsafe fn test_mm256_xor_ps() {
3674        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3675        let b = _mm256_set1_ps(0.);
3676        let r = _mm256_xor_ps(a, b);
3677        assert_eq_m256(r, a);
3678    }
3679
3680    #[simd_test(enable = "avx")]
3681    unsafe fn test_mm_cmp_pd() {
3682        let a = _mm_setr_pd(4., 9.);
3683        let b = _mm_setr_pd(4., 3.);
3684        let r = _mm_cmp_pd::<_CMP_GE_OS>(a, b);
3685        assert!(get_m128d(r, 0).is_nan());
3686        assert!(get_m128d(r, 1).is_nan());
3687    }
3688
3689    #[simd_test(enable = "avx")]
3690    unsafe fn test_mm256_cmp_pd() {
3691        let a = _mm256_setr_pd(1., 2., 3., 4.);
3692        let b = _mm256_setr_pd(5., 6., 7., 8.);
3693        let r = _mm256_cmp_pd::<_CMP_GE_OS>(a, b);
3694        let e = _mm256_set1_pd(0.);
3695        assert_eq_m256d(r, e);
3696    }
3697
3698    #[simd_test(enable = "avx")]
3699    unsafe fn test_mm_cmp_ps() {
3700        let a = _mm_setr_ps(4., 3., 2., 5.);
3701        let b = _mm_setr_ps(4., 9., 16., 25.);
3702        let r = _mm_cmp_ps::<_CMP_GE_OS>(a, b);
3703        assert!(get_m128(r, 0).is_nan());
3704        assert_eq!(get_m128(r, 1), 0.);
3705        assert_eq!(get_m128(r, 2), 0.);
3706        assert_eq!(get_m128(r, 3), 0.);
3707    }
3708
3709    #[simd_test(enable = "avx")]
3710    unsafe fn test_mm256_cmp_ps() {
3711        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3712        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3713        let r = _mm256_cmp_ps::<_CMP_GE_OS>(a, b);
3714        let e = _mm256_set1_ps(0.);
3715        assert_eq_m256(r, e);
3716    }
3717
3718    #[simd_test(enable = "avx")]
3719    unsafe fn test_mm_cmp_sd() {
3720        let a = _mm_setr_pd(4., 9.);
3721        let b = _mm_setr_pd(4., 3.);
3722        let r = _mm_cmp_sd::<_CMP_GE_OS>(a, b);
3723        assert!(get_m128d(r, 0).is_nan());
3724        assert_eq!(get_m128d(r, 1), 9.);
3725    }
3726
3727    #[simd_test(enable = "avx")]
3728    unsafe fn test_mm_cmp_ss() {
3729        let a = _mm_setr_ps(4., 3., 2., 5.);
3730        let b = _mm_setr_ps(4., 9., 16., 25.);
3731        let r = _mm_cmp_ss::<_CMP_GE_OS>(a, b);
3732        assert!(get_m128(r, 0).is_nan());
3733        assert_eq!(get_m128(r, 1), 3.);
3734        assert_eq!(get_m128(r, 2), 2.);
3735        assert_eq!(get_m128(r, 3), 5.);
3736    }
3737
3738    #[simd_test(enable = "avx")]
3739    unsafe fn test_mm256_cvtepi32_pd() {
3740        let a = _mm_setr_epi32(4, 9, 16, 25);
3741        let r = _mm256_cvtepi32_pd(a);
3742        let e = _mm256_setr_pd(4., 9., 16., 25.);
3743        assert_eq_m256d(r, e);
3744    }
3745
3746    #[simd_test(enable = "avx")]
3747    unsafe fn test_mm256_cvtepi32_ps() {
3748        let a = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3749        let r = _mm256_cvtepi32_ps(a);
3750        let e = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3751        assert_eq_m256(r, e);
3752    }
3753
3754    #[simd_test(enable = "avx")]
3755    unsafe fn test_mm256_cvtpd_ps() {
3756        let a = _mm256_setr_pd(4., 9., 16., 25.);
3757        let r = _mm256_cvtpd_ps(a);
3758        let e = _mm_setr_ps(4., 9., 16., 25.);
3759        assert_eq_m128(r, e);
3760    }
3761
3762    #[simd_test(enable = "avx")]
3763    unsafe fn test_mm256_cvtps_epi32() {
3764        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3765        let r = _mm256_cvtps_epi32(a);
3766        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3767        assert_eq_m256i(r, e);
3768    }
3769
3770    #[simd_test(enable = "avx")]
3771    unsafe fn test_mm256_cvtps_pd() {
3772        let a = _mm_setr_ps(4., 9., 16., 25.);
3773        let r = _mm256_cvtps_pd(a);
3774        let e = _mm256_setr_pd(4., 9., 16., 25.);
3775        assert_eq_m256d(r, e);
3776    }
3777
3778    #[simd_test(enable = "avx")]
3779    unsafe fn test_mm256_cvtsd_f64() {
3780        let a = _mm256_setr_pd(1., 2., 3., 4.);
3781        let r = _mm256_cvtsd_f64(a);
3782        assert_eq!(r, 1.);
3783    }
3784
3785    #[simd_test(enable = "avx")]
3786    unsafe fn test_mm256_cvttpd_epi32() {
3787        let a = _mm256_setr_pd(4., 9., 16., 25.);
3788        let r = _mm256_cvttpd_epi32(a);
3789        let e = _mm_setr_epi32(4, 9, 16, 25);
3790        assert_eq_m128i(r, e);
3791    }
3792
3793    #[simd_test(enable = "avx")]
3794    unsafe fn test_mm256_cvtpd_epi32() {
3795        let a = _mm256_setr_pd(4., 9., 16., 25.);
3796        let r = _mm256_cvtpd_epi32(a);
3797        let e = _mm_setr_epi32(4, 9, 16, 25);
3798        assert_eq_m128i(r, e);
3799    }
3800
3801    #[simd_test(enable = "avx")]
3802    unsafe fn test_mm256_cvttps_epi32() {
3803        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3804        let r = _mm256_cvttps_epi32(a);
3805        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3806        assert_eq_m256i(r, e);
3807    }
3808
3809    #[simd_test(enable = "avx")]
3810    unsafe fn test_mm256_extractf128_ps() {
3811        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3812        let r = _mm256_extractf128_ps::<0>(a);
3813        let e = _mm_setr_ps(4., 3., 2., 5.);
3814        assert_eq_m128(r, e);
3815    }
3816
3817    #[simd_test(enable = "avx")]
3818    unsafe fn test_mm256_extractf128_pd() {
3819        let a = _mm256_setr_pd(4., 3., 2., 5.);
3820        let r = _mm256_extractf128_pd::<0>(a);
3821        let e = _mm_setr_pd(4., 3.);
3822        assert_eq_m128d(r, e);
3823    }
3824
3825    #[simd_test(enable = "avx")]
3826    unsafe fn test_mm256_extractf128_si256() {
3827        let a = _mm256_setr_epi64x(4, 3, 2, 5);
3828        let r = _mm256_extractf128_si256::<0>(a);
3829        let e = _mm_setr_epi64x(4, 3);
3830        assert_eq_m128i(r, e);
3831    }
3832
3833    #[simd_test(enable = "avx")]
3834    unsafe fn test_mm256_extract_epi32() {
3835        let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
3836        let r1 = _mm256_extract_epi32::<0>(a);
3837        let r2 = _mm256_extract_epi32::<3>(a);
3838        assert_eq!(r1, -1);
3839        assert_eq!(r2, 3);
3840    }
3841
3842    #[simd_test(enable = "avx")]
3843    unsafe fn test_mm256_cvtsi256_si32() {
3844        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3845        let r = _mm256_cvtsi256_si32(a);
3846        assert_eq!(r, 1);
3847    }
3848
3849    #[simd_test(enable = "avx")]
3850    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
3851    unsafe fn test_mm256_zeroall() {
3852        _mm256_zeroall();
3853    }
3854
3855    #[simd_test(enable = "avx")]
3856    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
3857    unsafe fn test_mm256_zeroupper() {
3858        _mm256_zeroupper();
3859    }
3860
3861    #[simd_test(enable = "avx")]
3862    unsafe fn test_mm256_permutevar_ps() {
3863        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3864        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3865        let r = _mm256_permutevar_ps(a, b);
3866        let e = _mm256_setr_ps(3., 2., 5., 4., 9., 64., 50., 8.);
3867        assert_eq_m256(r, e);
3868    }
3869
3870    #[simd_test(enable = "avx")]
3871    unsafe fn test_mm_permutevar_ps() {
3872        let a = _mm_setr_ps(4., 3., 2., 5.);
3873        let b = _mm_setr_epi32(1, 2, 3, 4);
3874        let r = _mm_permutevar_ps(a, b);
3875        let e = _mm_setr_ps(3., 2., 5., 4.);
3876        assert_eq_m128(r, e);
3877    }
3878
3879    #[simd_test(enable = "avx")]
3880    unsafe fn test_mm256_permute_ps() {
3881        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3882        let r = _mm256_permute_ps::<0x1b>(a);
3883        let e = _mm256_setr_ps(5., 2., 3., 4., 50., 64., 9., 8.);
3884        assert_eq_m256(r, e);
3885    }
3886
3887    #[simd_test(enable = "avx")]
3888    unsafe fn test_mm_permute_ps() {
3889        let a = _mm_setr_ps(4., 3., 2., 5.);
3890        let r = _mm_permute_ps::<0x1b>(a);
3891        let e = _mm_setr_ps(5., 2., 3., 4.);
3892        assert_eq_m128(r, e);
3893    }
3894
3895    #[simd_test(enable = "avx")]
3896    unsafe fn test_mm256_permutevar_pd() {
3897        let a = _mm256_setr_pd(4., 3., 2., 5.);
3898        let b = _mm256_setr_epi64x(1, 2, 3, 4);
3899        let r = _mm256_permutevar_pd(a, b);
3900        let e = _mm256_setr_pd(4., 3., 5., 2.);
3901        assert_eq_m256d(r, e);
3902    }
3903
3904    #[simd_test(enable = "avx")]
3905    unsafe fn test_mm_permutevar_pd() {
3906        let a = _mm_setr_pd(4., 3.);
3907        let b = _mm_setr_epi64x(3, 0);
3908        let r = _mm_permutevar_pd(a, b);
3909        let e = _mm_setr_pd(3., 4.);
3910        assert_eq_m128d(r, e);
3911    }
3912
3913    #[simd_test(enable = "avx")]
3914    unsafe fn test_mm256_permute_pd() {
3915        let a = _mm256_setr_pd(4., 3., 2., 5.);
3916        let r = _mm256_permute_pd::<5>(a);
3917        let e = _mm256_setr_pd(3., 4., 5., 2.);
3918        assert_eq_m256d(r, e);
3919    }
3920
3921    #[simd_test(enable = "avx")]
3922    unsafe fn test_mm_permute_pd() {
3923        let a = _mm_setr_pd(4., 3.);
3924        let r = _mm_permute_pd::<1>(a);
3925        let e = _mm_setr_pd(3., 4.);
3926        assert_eq_m128d(r, e);
3927    }
3928
3929    #[simd_test(enable = "avx")]
3930    unsafe fn test_mm256_permute2f128_ps() {
3931        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3932        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3933        let r = _mm256_permute2f128_ps::<0x13>(a, b);
3934        let e = _mm256_setr_ps(5., 6., 7., 8., 1., 2., 3., 4.);
3935        assert_eq_m256(r, e);
3936    }
3937
3938    #[simd_test(enable = "avx")]
3939    unsafe fn test_mm256_permute2f128_pd() {
3940        let a = _mm256_setr_pd(1., 2., 3., 4.);
3941        let b = _mm256_setr_pd(5., 6., 7., 8.);
3942        let r = _mm256_permute2f128_pd::<0x31>(a, b);
3943        let e = _mm256_setr_pd(3., 4., 7., 8.);
3944        assert_eq_m256d(r, e);
3945    }
3946
3947    #[simd_test(enable = "avx")]
3948    unsafe fn test_mm256_permute2f128_si256() {
3949        let a = _mm256_setr_epi32(1, 2, 3, 4, 1, 2, 3, 4);
3950        let b = _mm256_setr_epi32(5, 6, 7, 8, 5, 6, 7, 8);
3951        let r = _mm256_permute2f128_si256::<0x20>(a, b);
3952        let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3953        assert_eq_m256i(r, e);
3954    }
3955
3956    #[simd_test(enable = "avx")]
3957    unsafe fn test_mm256_broadcast_ss() {
3958        let r = _mm256_broadcast_ss(&3.);
3959        let e = _mm256_set1_ps(3.);
3960        assert_eq_m256(r, e);
3961    }
3962
3963    #[simd_test(enable = "avx")]
3964    unsafe fn test_mm_broadcast_ss() {
3965        let r = _mm_broadcast_ss(&3.);
3966        let e = _mm_set1_ps(3.);
3967        assert_eq_m128(r, e);
3968    }
3969
3970    #[simd_test(enable = "avx")]
3971    unsafe fn test_mm256_broadcast_sd() {
3972        let r = _mm256_broadcast_sd(&3.);
3973        let e = _mm256_set1_pd(3.);
3974        assert_eq_m256d(r, e);
3975    }
3976
3977    #[simd_test(enable = "avx")]
3978    unsafe fn test_mm256_broadcast_ps() {
3979        let a = _mm_setr_ps(4., 3., 2., 5.);
3980        let r = _mm256_broadcast_ps(&a);
3981        let e = _mm256_setr_ps(4., 3., 2., 5., 4., 3., 2., 5.);
3982        assert_eq_m256(r, e);
3983    }
3984
3985    #[simd_test(enable = "avx")]
3986    unsafe fn test_mm256_broadcast_pd() {
3987        let a = _mm_setr_pd(4., 3.);
3988        let r = _mm256_broadcast_pd(&a);
3989        let e = _mm256_setr_pd(4., 3., 4., 3.);
3990        assert_eq_m256d(r, e);
3991    }
3992
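    // insertf128: the const parameter selects which 128-bit half of `a` is replaced
    // by `b` (0 = low half, 1 = high half); the other half passes through unchanged.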
3993    #[simd_test(enable = "avx")]
3994    unsafe fn test_mm256_insertf128_ps() {
3995        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3996        let b = _mm_setr_ps(4., 9., 16., 25.);
3997        let r = _mm256_insertf128_ps::<0>(a, b);
3998        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
3999        assert_eq_m256(r, e);
4000    }
4001
4002    #[simd_test(enable = "avx")]
4003    unsafe fn test_mm256_insertf128_pd() {
4004        let a = _mm256_setr_pd(1., 2., 3., 4.);
4005        let b = _mm_setr_pd(5., 6.);
4006        let r = _mm256_insertf128_pd::<0>(a, b);
4007        let e = _mm256_setr_pd(5., 6., 3., 4.);
4008        assert_eq_m256d(r, e);
4009    }
4010
4011    #[simd_test(enable = "avx")]
4012    unsafe fn test_mm256_insertf128_si256() {
4013        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4014        let b = _mm_setr_epi64x(5, 6);
4015        let r = _mm256_insertf128_si256::<0>(a, b);
4016        let e = _mm256_setr_epi64x(5, 6, 3, 4);
4017        assert_eq_m256i(r, e);
4018    }
4019
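    // insert_epi*: the const parameter is the index (in `setr`/memory order) of the
    // single element replaced by the scalar argument; the rest are copied from `a`.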
4020    #[simd_test(enable = "avx")]
4021    unsafe fn test_mm256_insert_epi8() {
4022        #[rustfmt::skip]
4023        let a = _mm256_setr_epi8(
4024            1, 2, 3, 4, 5, 6, 7, 8,
4025            9, 10, 11, 12, 13, 14, 15, 16,
4026            17, 18, 19, 20, 21, 22, 23, 24,
4027            25, 26, 27, 28, 29, 30, 31, 32,
4028        );
4029        let r = _mm256_insert_epi8::<31>(a, 0);
4030        #[rustfmt::skip]
4031        let e = _mm256_setr_epi8(
4032            1, 2, 3, 4, 5, 6, 7, 8,
4033            9, 10, 11, 12, 13, 14, 15, 16,
4034            17, 18, 19, 20, 21, 22, 23, 24,
4035            25, 26, 27, 28, 29, 30, 31, 0,
4036        );
4037        assert_eq_m256i(r, e);
4038    }
4039
4040    #[simd_test(enable = "avx")]
4041    unsafe fn test_mm256_insert_epi16() {
4042        #[rustfmt::skip]
4043        let a = _mm256_setr_epi16(
4044            0, 1, 2, 3, 4, 5, 6, 7,
4045            8, 9, 10, 11, 12, 13, 14, 15,
4046        );
4047        let r = _mm256_insert_epi16::<15>(a, 0);
4048        #[rustfmt::skip]
4049        let e = _mm256_setr_epi16(
4050            0, 1, 2, 3, 4, 5, 6, 7,
4051            8, 9, 10, 11, 12, 13, 14, 0,
4052        );
4053        assert_eq_m256i(r, e);
4054    }
4055
4056    #[simd_test(enable = "avx")]
4057    unsafe fn test_mm256_insert_epi32() {
4058        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4059        let r = _mm256_insert_epi32::<7>(a, 0);
4060        let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
4061        assert_eq_m256i(r, e);
4062    }
4063
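    // The aligned load/store intrinsics require 32-byte alignment, which these tests
    // satisfy by addressing `__m256*` locals directly; the `loadu`/`storeu` variants
    // below carry no alignment requirement.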
4064    #[simd_test(enable = "avx")]
4065    unsafe fn test_mm256_load_pd() {
4066        let a = _mm256_setr_pd(1., 2., 3., 4.);
4067        let p = ptr::addr_of!(a) as *const f64;
4068        let r = _mm256_load_pd(p);
4069        let e = _mm256_setr_pd(1., 2., 3., 4.);
4070        assert_eq_m256d(r, e);
4071    }
4072
4073    #[simd_test(enable = "avx")]
4074    unsafe fn test_mm256_store_pd() {
4075        let a = _mm256_setr_pd(1., 2., 3., 4.);
4076        let mut r = _mm256_undefined_pd();
4077        _mm256_store_pd(ptr::addr_of_mut!(r) as *mut f64, a);
4078        assert_eq_m256d(r, a);
4079    }
4080
4081    #[simd_test(enable = "avx")]
4082    unsafe fn test_mm256_load_ps() {
4083        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4084        let p = ptr::addr_of!(a) as *const f32;
4085        let r = _mm256_load_ps(p);
4086        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4087        assert_eq_m256(r, e);
4088    }
4089
4090    #[simd_test(enable = "avx")]
4091    unsafe fn test_mm256_store_ps() {
4092        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4093        let mut r = _mm256_undefined_ps();
4094        _mm256_store_ps(ptr::addr_of_mut!(r) as *mut f32, a);
4095        assert_eq_m256(r, a);
4096    }
4097
4098    #[simd_test(enable = "avx")]
4099    unsafe fn test_mm256_loadu_pd() {
4100        let a = &[1.0f64, 2., 3., 4.];
4101        let p = a.as_ptr();
4102        let r = _mm256_loadu_pd(black_box(p));
4103        let e = _mm256_setr_pd(1., 2., 3., 4.);
4104        assert_eq_m256d(r, e);
4105    }
4106
4107    #[simd_test(enable = "avx")]
4108    unsafe fn test_mm256_storeu_pd() {
4109        let a = _mm256_set1_pd(9.);
4110        let mut r = _mm256_undefined_pd();
4111        _mm256_storeu_pd(ptr::addr_of_mut!(r) as *mut f64, a);
4112        assert_eq_m256d(r, a);
4113    }
4114
4115    #[simd_test(enable = "avx")]
4116    unsafe fn test_mm256_loadu_ps() {
4117        let a = &[4., 3., 2., 5., 8., 9., 64., 50.];
4118        let p = a.as_ptr();
4119        let r = _mm256_loadu_ps(black_box(p));
4120        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4121        assert_eq_m256(r, e);
4122    }
4123
4124    #[simd_test(enable = "avx")]
4125    unsafe fn test_mm256_storeu_ps() {
4126        let a = _mm256_set1_ps(9.);
4127        let mut r = _mm256_undefined_ps();
4128        _mm256_storeu_ps(ptr::addr_of_mut!(r) as *mut f32, a);
4129        assert_eq_m256(r, a);
4130    }
4131
4132    #[simd_test(enable = "avx")]
4133    unsafe fn test_mm256_load_si256() {
4134        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4135        let p = ptr::addr_of!(a);
4136        let r = _mm256_load_si256(p);
4137        let e = _mm256_setr_epi64x(1, 2, 3, 4);
4138        assert_eq_m256i(r, e);
4139    }
4140
4141    #[simd_test(enable = "avx")]
4142    unsafe fn test_mm256_store_si256() {
4143        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4144        let mut r = _mm256_undefined_si256();
4145        _mm256_store_si256(ptr::addr_of_mut!(r), a);
4146        assert_eq_m256i(r, a);
4147    }
4148
4149    #[simd_test(enable = "avx")]
4150    unsafe fn test_mm256_loadu_si256() {
4151        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4152        let p = ptr::addr_of!(a);
4153        let r = _mm256_loadu_si256(black_box(p));
4154        let e = _mm256_setr_epi64x(1, 2, 3, 4);
4155        assert_eq_m256i(r, e);
4156    }
4157
4158    #[simd_test(enable = "avx")]
4159    unsafe fn test_mm256_storeu_si256() {
4160        let a = _mm256_set1_epi8(9);
4161        let mut r = _mm256_undefined_si256();
4162        _mm256_storeu_si256(ptr::addr_of_mut!(r), a);
4163        assert_eq_m256i(r, a);
4164    }
4165
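    // maskload/maskstore look only at the most significant (sign) bit of each mask
    // element: lanes with the bit set (`!0`) are transferred, masked-off loads
    // produce 0.0, and masked-off stores leave memory untouched.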
4166    #[simd_test(enable = "avx")]
4167    unsafe fn test_mm256_maskload_pd() {
4168        let a = &[1.0f64, 2., 3., 4.];
4169        let p = a.as_ptr();
4170        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
4171        let r = _mm256_maskload_pd(black_box(p), mask);
4172        let e = _mm256_setr_pd(0., 2., 0., 4.);
4173        assert_eq_m256d(r, e);
4174    }
4175
4176    #[simd_test(enable = "avx")]
4177    unsafe fn test_mm256_maskstore_pd() {
4178        let mut r = _mm256_set1_pd(0.);
4179        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
4180        let a = _mm256_setr_pd(1., 2., 3., 4.);
4181        _mm256_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
4182        let e = _mm256_setr_pd(0., 2., 0., 4.);
4183        assert_eq_m256d(r, e);
4184    }
4185
4186    #[simd_test(enable = "avx")]
4187    unsafe fn test_mm_maskload_pd() {
4188        let a = &[1.0f64, 2.];
4189        let p = a.as_ptr();
4190        let mask = _mm_setr_epi64x(0, !0);
4191        let r = _mm_maskload_pd(black_box(p), mask);
4192        let e = _mm_setr_pd(0., 2.);
4193        assert_eq_m128d(r, e);
4194    }
4195
4196    #[simd_test(enable = "avx")]
4197    unsafe fn test_mm_maskstore_pd() {
4198        let mut r = _mm_set1_pd(0.);
4199        let mask = _mm_setr_epi64x(0, !0);
4200        let a = _mm_setr_pd(1., 2.);
4201        _mm_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
4202        let e = _mm_setr_pd(0., 2.);
4203        assert_eq_m128d(r, e);
4204    }
4205
4206    #[simd_test(enable = "avx")]
4207    unsafe fn test_mm256_maskload_ps() {
4208        let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.];
4209        let p = a.as_ptr();
4210        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4211        let r = _mm256_maskload_ps(black_box(p), mask);
4212        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4213        assert_eq_m256(r, e);
4214    }
4215
4216    #[simd_test(enable = "avx")]
4217    unsafe fn test_mm256_maskstore_ps() {
4218        let mut r = _mm256_set1_ps(0.);
4219        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4220        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4221        _mm256_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
4222        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4223        assert_eq_m256(r, e);
4224    }
4225
4226    #[simd_test(enable = "avx")]
4227    unsafe fn test_mm_maskload_ps() {
4228        let a = &[1.0f32, 2., 3., 4.];
4229        let p = a.as_ptr();
4230        let mask = _mm_setr_epi32(0, !0, 0, !0);
4231        let r = _mm_maskload_ps(black_box(p), mask);
4232        let e = _mm_setr_ps(0., 2., 0., 4.);
4233        assert_eq_m128(r, e);
4234    }
4235
4236    #[simd_test(enable = "avx")]
4237    unsafe fn test_mm_maskstore_ps() {
4238        let mut r = _mm_set1_ps(0.);
4239        let mask = _mm_setr_epi32(0, !0, 0, !0);
4240        let a = _mm_setr_ps(1., 2., 3., 4.);
4241        _mm_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
4242        let e = _mm_setr_ps(0., 2., 0., 4.);
4243        assert_eq_m128(r, e);
4244    }
4245
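    // movehdup duplicates the odd-indexed element of each 32-bit pair, moveldup the
    // even-indexed one, and movedup_pd the low element of each 64-bit pair.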
4246    #[simd_test(enable = "avx")]
4247    unsafe fn test_mm256_movehdup_ps() {
4248        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4249        let r = _mm256_movehdup_ps(a);
4250        let e = _mm256_setr_ps(2., 2., 4., 4., 6., 6., 8., 8.);
4251        assert_eq_m256(r, e);
4252    }
4253
4254    #[simd_test(enable = "avx")]
4255    unsafe fn test_mm256_moveldup_ps() {
4256        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4257        let r = _mm256_moveldup_ps(a);
4258        let e = _mm256_setr_ps(1., 1., 3., 3., 5., 5., 7., 7.);
4259        assert_eq_m256(r, e);
4260    }
4261
4262    #[simd_test(enable = "avx")]
4263    unsafe fn test_mm256_movedup_pd() {
4264        let a = _mm256_setr_pd(1., 2., 3., 4.);
4265        let r = _mm256_movedup_pd(a);
4266        let e = _mm256_setr_pd(1., 1., 3., 3.);
4267        assert_eq_m256d(r, e);
4268    }
4269
4270    #[simd_test(enable = "avx")]
4271    unsafe fn test_mm256_lddqu_si256() {
4272        #[rustfmt::skip]
4273        let a = _mm256_setr_epi8(
4274            1, 2, 3, 4, 5, 6, 7, 8,
4275            9, 10, 11, 12, 13, 14, 15, 16,
4276            17, 18, 19, 20, 21, 22, 23, 24,
4277            25, 26, 27, 28, 29, 30, 31, 32,
4278        );
4279        let p = ptr::addr_of!(a);
4280        let r = _mm256_lddqu_si256(black_box(p));
4281        #[rustfmt::skip]
4282        let e = _mm256_setr_epi8(
4283            1, 2, 3, 4, 5, 6, 7, 8,
4284            9, 10, 11, 12, 13, 14, 15, 16,
4285            17, 18, 19, 20, 21, 22, 23, 24,
4286            25, 26, 27, 28, 29, 30, 31, 32,
4287        );
4288        assert_eq_m256i(r, e);
4289    }
4290
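    // Non-temporal (streaming) stores bypass the cache and are weakly ordered, so
    // each test issues `_mm_sfence()` before reading the stored data back.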
4291    #[simd_test(enable = "avx")]
4292    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
4293    unsafe fn test_mm256_stream_si256() {
4294        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4295        let mut r = _mm256_undefined_si256();
4296        _mm256_stream_si256(ptr::addr_of_mut!(r), a);
4297        _mm_sfence();
4298        assert_eq_m256i(r, a);
4299    }
4300
4301    #[simd_test(enable = "avx")]
4302    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
4303    unsafe fn test_mm256_stream_pd() {
4304        #[repr(align(32))]
4305        struct Memory {
4306            pub data: [f64; 4],
4307        }
4308        let a = _mm256_set1_pd(7.0);
4309        let mut mem = Memory { data: [-1.0; 4] };
4310
4311        _mm256_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
4312        _mm_sfence();
4313        for i in 0..4 {
4314            assert_eq!(mem.data[i], get_m256d(a, i));
4315        }
4316    }
4317
4318    #[simd_test(enable = "avx")]
4319    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
4320    unsafe fn test_mm256_stream_ps() {
4321        #[repr(align(32))]
4322        struct Memory {
4323            pub data: [f32; 8],
4324        }
4325        let a = _mm256_set1_ps(7.0);
4326        let mut mem = Memory { data: [-1.0; 8] };
4327
4328        _mm256_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
4329        _mm_sfence();
4330        for i in 0..8 {
4331            assert_eq!(mem.data[i], get_m256(a, i));
4332        }
4333    }
4334
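    // vrcpps/vrsqrtps only approximate 1/x and 1/sqrt(x) (Intel documents a maximum
    // relative error of 1.5 * 2^-12), so the results are checked against a tolerance
    // rather than for exact equality.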
4335    #[simd_test(enable = "avx")]
4336    unsafe fn test_mm256_rcp_ps() {
4337        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4338        let r = _mm256_rcp_ps(a);
4339        #[rustfmt::skip]
4340        let e = _mm256_setr_ps(
4341            0.99975586, 0.49987793, 0.33325195, 0.24993896,
4342            0.19995117, 0.16662598, 0.14282227, 0.12496948,
4343        );
4344        let rel_err = 0.00048828125;
4345        for i in 0..8 {
4346            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
4347        }
4348    }
4349
4350    #[simd_test(enable = "avx")]
4351    unsafe fn test_mm256_rsqrt_ps() {
4352        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4353        let r = _mm256_rsqrt_ps(a);
4354        #[rustfmt::skip]
4355        let e = _mm256_setr_ps(
4356            0.99975586, 0.7069092, 0.5772705, 0.49987793,
4357            0.44714355, 0.40820313, 0.3779297, 0.3534546,
4358        );
4359        let rel_err = 0.00048828125;
4360        for i in 0..8 {
4361            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
4362        }
4363    }
4364
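    // The 256-bit unpack instructions interleave within each 128-bit lane
    // independently, which is why the expected vectors are not a single full-width
    // interleave of `a` and `b`.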
4365    #[simd_test(enable = "avx")]
4366    unsafe fn test_mm256_unpackhi_pd() {
4367        let a = _mm256_setr_pd(1., 2., 3., 4.);
4368        let b = _mm256_setr_pd(5., 6., 7., 8.);
4369        let r = _mm256_unpackhi_pd(a, b);
4370        let e = _mm256_setr_pd(2., 6., 4., 8.);
4371        assert_eq_m256d(r, e);
4372    }
4373
4374    #[simd_test(enable = "avx")]
4375    unsafe fn test_mm256_unpackhi_ps() {
4376        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4377        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
4378        let r = _mm256_unpackhi_ps(a, b);
4379        let e = _mm256_setr_ps(3., 11., 4., 12., 7., 15., 8., 16.);
4380        assert_eq_m256(r, e);
4381    }
4382
4383    #[simd_test(enable = "avx")]
4384    unsafe fn test_mm256_unpacklo_pd() {
4385        let a = _mm256_setr_pd(1., 2., 3., 4.);
4386        let b = _mm256_setr_pd(5., 6., 7., 8.);
4387        let r = _mm256_unpacklo_pd(a, b);
4388        let e = _mm256_setr_pd(1., 5., 3., 7.);
4389        assert_eq_m256d(r, e);
4390    }
4391
4392    #[simd_test(enable = "avx")]
4393    unsafe fn test_mm256_unpacklo_ps() {
4394        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4395        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
4396        let r = _mm256_unpacklo_ps(a, b);
4397        let e = _mm256_setr_ps(1., 9., 2., 10., 5., 13., 6., 14.);
4398        assert_eq_m256(r, e);
4399    }
4400
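    // The test intrinsics map to vptest/vtestpd/vtestps: ZF is set when `a & b` is
    // all zero and CF when `!a & b` is all zero (the pd/ps forms only examine sign
    // bits). `testz` returns ZF, `testc` returns CF, and `testnzc` returns 1 only
    // when both flags are clear.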
4401    #[simd_test(enable = "avx")]
4402    unsafe fn test_mm256_testz_si256() {
4403        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4404        let b = _mm256_setr_epi64x(5, 6, 7, 8);
4405        let r = _mm256_testz_si256(a, b);
4406        assert_eq!(r, 0);
4407        let b = _mm256_set1_epi64x(0);
4408        let r = _mm256_testz_si256(a, b);
4409        assert_eq!(r, 1);
4410    }
4411
4412    #[simd_test(enable = "avx")]
4413    unsafe fn test_mm256_testc_si256() {
4414        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4415        let b = _mm256_setr_epi64x(5, 6, 7, 8);
4416        let r = _mm256_testc_si256(a, b);
4417        assert_eq!(r, 0);
4418        let b = _mm256_set1_epi64x(0);
4419        let r = _mm256_testc_si256(a, b);
4420        assert_eq!(r, 1);
4421    }
4422
4423    #[simd_test(enable = "avx")]
4424    unsafe fn test_mm256_testnzc_si256() {
4425        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4426        let b = _mm256_setr_epi64x(5, 6, 7, 8);
4427        let r = _mm256_testnzc_si256(a, b);
4428        assert_eq!(r, 1);
4429        let a = _mm256_setr_epi64x(0, 0, 0, 0);
4430        let b = _mm256_setr_epi64x(0, 0, 0, 0);
4431        let r = _mm256_testnzc_si256(a, b);
4432        assert_eq!(r, 0);
4433    }
4434
4435    #[simd_test(enable = "avx")]
4436    unsafe fn test_mm256_testz_pd() {
4437        let a = _mm256_setr_pd(1., 2., 3., 4.);
4438        let b = _mm256_setr_pd(5., 6., 7., 8.);
4439        let r = _mm256_testz_pd(a, b);
4440        assert_eq!(r, 1);
4441        let a = _mm256_set1_pd(-1.);
4442        let r = _mm256_testz_pd(a, a);
4443        assert_eq!(r, 0);
4444    }
4445
4446    #[simd_test(enable = "avx")]
4447    unsafe fn test_mm256_testc_pd() {
4448        let a = _mm256_setr_pd(1., 2., 3., 4.);
4449        let b = _mm256_setr_pd(5., 6., 7., 8.);
4450        let r = _mm256_testc_pd(a, b);
4451        assert_eq!(r, 1);
4452        let a = _mm256_set1_pd(1.);
4453        let b = _mm256_set1_pd(-1.);
4454        let r = _mm256_testc_pd(a, b);
4455        assert_eq!(r, 0);
4456    }
4457
4458    #[simd_test(enable = "avx")]
4459    unsafe fn test_mm256_testnzc_pd() {
4460        let a = _mm256_setr_pd(1., 2., 3., 4.);
4461        let b = _mm256_setr_pd(5., 6., 7., 8.);
4462        let r = _mm256_testnzc_pd(a, b);
4463        assert_eq!(r, 0);
4464        let a = _mm256_setr_pd(1., -1., -1., -1.);
4465        let b = _mm256_setr_pd(-1., -1., 1., 1.);
4466        let r = _mm256_testnzc_pd(a, b);
4467        assert_eq!(r, 1);
4468    }
4469
4470    #[simd_test(enable = "avx")]
4471    unsafe fn test_mm_testz_pd() {
4472        let a = _mm_setr_pd(1., 2.);
4473        let b = _mm_setr_pd(5., 6.);
4474        let r = _mm_testz_pd(a, b);
4475        assert_eq!(r, 1);
4476        let a = _mm_set1_pd(-1.);
4477        let r = _mm_testz_pd(a, a);
4478        assert_eq!(r, 0);
4479    }
4480
4481    #[simd_test(enable = "avx")]
4482    unsafe fn test_mm_testc_pd() {
4483        let a = _mm_setr_pd(1., 2.);
4484        let b = _mm_setr_pd(5., 6.);
4485        let r = _mm_testc_pd(a, b);
4486        assert_eq!(r, 1);
4487        let a = _mm_set1_pd(1.);
4488        let b = _mm_set1_pd(-1.);
4489        let r = _mm_testc_pd(a, b);
4490        assert_eq!(r, 0);
4491    }
4492
4493    #[simd_test(enable = "avx")]
4494    unsafe fn test_mm_testnzc_pd() {
4495        let a = _mm_setr_pd(1., 2.);
4496        let b = _mm_setr_pd(5., 6.);
4497        let r = _mm_testnzc_pd(a, b);
4498        assert_eq!(r, 0);
4499        let a = _mm_setr_pd(1., -1.);
4500        let b = _mm_setr_pd(-1., -1.);
4501        let r = _mm_testnzc_pd(a, b);
4502        assert_eq!(r, 1);
4503    }
4504
4505    #[simd_test(enable = "avx")]
4506    unsafe fn test_mm256_testz_ps() {
4507        let a = _mm256_set1_ps(1.);
4508        let r = _mm256_testz_ps(a, a);
4509        assert_eq!(r, 1);
4510        let a = _mm256_set1_ps(-1.);
4511        let r = _mm256_testz_ps(a, a);
4512        assert_eq!(r, 0);
4513    }
4514
4515    #[simd_test(enable = "avx")]
4516    unsafe fn test_mm256_testc_ps() {
4517        let a = _mm256_set1_ps(1.);
4518        let r = _mm256_testc_ps(a, a);
4519        assert_eq!(r, 1);
4520        let b = _mm256_set1_ps(-1.);
4521        let r = _mm256_testc_ps(a, b);
4522        assert_eq!(r, 0);
4523    }
4524
4525    #[simd_test(enable = "avx")]
4526    unsafe fn test_mm256_testnzc_ps() {
4527        let a = _mm256_set1_ps(1.);
4528        let r = _mm256_testnzc_ps(a, a);
4529        assert_eq!(r, 0);
4530        let a = _mm256_setr_ps(1., -1., -1., -1., -1., -1., -1., -1.);
4531        let b = _mm256_setr_ps(-1., -1., 1., 1., 1., 1., 1., 1.);
4532        let r = _mm256_testnzc_ps(a, b);
4533        assert_eq!(r, 1);
4534    }
4535
4536    #[simd_test(enable = "avx")]
4537    unsafe fn test_mm_testz_ps() {
4538        let a = _mm_set1_ps(1.);
4539        let r = _mm_testz_ps(a, a);
4540        assert_eq!(r, 1);
4541        let a = _mm_set1_ps(-1.);
4542        let r = _mm_testz_ps(a, a);
4543        assert_eq!(r, 0);
4544    }
4545
4546    #[simd_test(enable = "avx")]
4547    unsafe fn test_mm_testc_ps() {
4548        let a = _mm_set1_ps(1.);
4549        let r = _mm_testc_ps(a, a);
4550        assert_eq!(r, 1);
4551        let b = _mm_set1_ps(-1.);
4552        let r = _mm_testc_ps(a, b);
4553        assert_eq!(r, 0);
4554    }
4555
4556    #[simd_test(enable = "avx")]
4557    unsafe fn test_mm_testnzc_ps() {
4558        let a = _mm_set1_ps(1.);
4559        let r = _mm_testnzc_ps(a, a);
4560        assert_eq!(r, 0);
4561        let a = _mm_setr_ps(1., -1., -1., -1.);
4562        let b = _mm_setr_ps(-1., -1., 1., 1.);
4563        let r = _mm_testnzc_ps(a, b);
4564        assert_eq!(r, 1);
4565    }
4566
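    // movemask gathers the sign bit of each element into the low bits of the result,
    // so alternating negative elements yield 0b1010 (0xA) and 0b1010_1010 (0xAA).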
4567    #[simd_test(enable = "avx")]
4568    unsafe fn test_mm256_movemask_pd() {
4569        let a = _mm256_setr_pd(1., -2., 3., -4.);
4570        let r = _mm256_movemask_pd(a);
4571        assert_eq!(r, 0xA);
4572    }
4573
4574    #[simd_test(enable = "avx")]
4575    unsafe fn test_mm256_movemask_ps() {
4576        let a = _mm256_setr_ps(1., -2., 3., -4., 1., -2., 3., -4.);
4577        let r = _mm256_movemask_ps(a);
4578        assert_eq!(r, 0xAA);
4579    }
4580
4581    #[simd_test(enable = "avx")]
4582    unsafe fn test_mm256_setzero_pd() {
4583        let r = _mm256_setzero_pd();
4584        assert_eq_m256d(r, _mm256_set1_pd(0.));
4585    }
4586
4587    #[simd_test(enable = "avx")]
4588    unsafe fn test_mm256_setzero_ps() {
4589        let r = _mm256_setzero_ps();
4590        assert_eq_m256(r, _mm256_set1_ps(0.));
4591    }
4592
4593    #[simd_test(enable = "avx")]
4594    unsafe fn test_mm256_setzero_si256() {
4595        let r = _mm256_setzero_si256();
4596        assert_eq_m256i(r, _mm256_set1_epi8(0));
4597    }
4598
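    // The `set` constructors take their arguments from the highest element down,
    // while `setr` takes them in memory order, so `set(1., 2., 3., 4.)` equals
    // `setr(4., 3., 2., 1.)`.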
4599    #[simd_test(enable = "avx")]
4600    unsafe fn test_mm256_set_pd() {
4601        let r = _mm256_set_pd(1., 2., 3., 4.);
4602        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 1.));
4603    }
4604
4605    #[simd_test(enable = "avx")]
4606    unsafe fn test_mm256_set_ps() {
4607        let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4608        assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.));
4609    }
4610
4611    #[simd_test(enable = "avx")]
4612    unsafe fn test_mm256_set_epi8() {
4613        #[rustfmt::skip]
4614        let r = _mm256_set_epi8(
4615            1, 2, 3, 4, 5, 6, 7, 8,
4616            9, 10, 11, 12, 13, 14, 15, 16,
4617            17, 18, 19, 20, 21, 22, 23, 24,
4618            25, 26, 27, 28, 29, 30, 31, 32,
4619        );
4620        #[rustfmt::skip]
4621        let e = _mm256_setr_epi8(
4622            32, 31, 30, 29, 28, 27, 26, 25,
4623            24, 23, 22, 21, 20, 19, 18, 17,
4624            16, 15, 14, 13, 12, 11, 10, 9,
4625            8, 7, 6, 5, 4, 3, 2, 1
4626        );
4627        assert_eq_m256i(r, e);
4628    }
4629
4630    #[simd_test(enable = "avx")]
4631    unsafe fn test_mm256_set_epi16() {
4632        #[rustfmt::skip]
4633        let r = _mm256_set_epi16(
4634            1, 2, 3, 4, 5, 6, 7, 8,
4635            9, 10, 11, 12, 13, 14, 15, 16,
4636        );
4637        #[rustfmt::skip]
4638        let e = _mm256_setr_epi16(
4639            16, 15, 14, 13, 12, 11, 10, 9, 8,
4640            7, 6, 5, 4, 3, 2, 1,
4641        );
4642        assert_eq_m256i(r, e);
4643    }
4644
4645    #[simd_test(enable = "avx")]
4646    unsafe fn test_mm256_set_epi32() {
4647        let r = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4648        assert_eq_m256i(r, _mm256_setr_epi32(8, 7, 6, 5, 4, 3, 2, 1));
4649    }
4650
4651    #[simd_test(enable = "avx")]
4652    unsafe fn test_mm256_set_epi64x() {
4653        let r = _mm256_set_epi64x(1, 2, 3, 4);
4654        assert_eq_m256i(r, _mm256_setr_epi64x(4, 3, 2, 1));
4655    }
4656
4657    #[simd_test(enable = "avx")]
4658    unsafe fn test_mm256_setr_pd() {
4659        let r = _mm256_setr_pd(1., 2., 3., 4.);
4660        assert_eq_m256d(r, _mm256_setr_pd(1., 2., 3., 4.));
4661    }
4662
4663    #[simd_test(enable = "avx")]
4664    unsafe fn test_mm256_setr_ps() {
4665        let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4666        assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.));
4667    }
4668
4669    #[simd_test(enable = "avx")]
4670    unsafe fn test_mm256_setr_epi8() {
4671        #[rustfmt::skip]
4672        let r = _mm256_setr_epi8(
4673            1, 2, 3, 4, 5, 6, 7, 8,
4674            9, 10, 11, 12, 13, 14, 15, 16,
4675            17, 18, 19, 20, 21, 22, 23, 24,
4676            25, 26, 27, 28, 29, 30, 31, 32,
4677        );
4678        #[rustfmt::skip]
4679        let e = _mm256_setr_epi8(
4680            1, 2, 3, 4, 5, 6, 7, 8,
4681            9, 10, 11, 12, 13, 14, 15, 16,
4682            17, 18, 19, 20, 21, 22, 23, 24,
4683            25, 26, 27, 28, 29, 30, 31, 32
4684        );
4685
4686        assert_eq_m256i(r, e);
4687    }
4688
4689    #[simd_test(enable = "avx")]
4690    unsafe fn test_mm256_setr_epi16() {
4691        #[rustfmt::skip]
4692        let r = _mm256_setr_epi16(
4693            1, 2, 3, 4, 5, 6, 7, 8,
4694            9, 10, 11, 12, 13, 14, 15, 16,
4695        );
4696        #[rustfmt::skip]
4697        let e = _mm256_setr_epi16(
4698            1, 2, 3, 4, 5, 6, 7, 8,
4699            9, 10, 11, 12, 13, 14, 15, 16,
4700        );
4701        assert_eq_m256i(r, e);
4702    }
4703
4704    #[simd_test(enable = "avx")]
4705    unsafe fn test_mm256_setr_epi32() {
4706        let r = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4707        assert_eq_m256i(r, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8));
4708    }
4709
4710    #[simd_test(enable = "avx")]
4711    unsafe fn test_mm256_setr_epi64x() {
4712        let r = _mm256_setr_epi64x(1, 2, 3, 4);
4713        assert_eq_m256i(r, _mm256_setr_epi64x(1, 2, 3, 4));
4714    }
4715
4716    #[simd_test(enable = "avx")]
4717    unsafe fn test_mm256_set1_pd() {
4718        let r = _mm256_set1_pd(1.);
4719        assert_eq_m256d(r, _mm256_set1_pd(1.));
4720    }
4721
4722    #[simd_test(enable = "avx")]
4723    unsafe fn test_mm256_set1_ps() {
4724        let r = _mm256_set1_ps(1.);
4725        assert_eq_m256(r, _mm256_set1_ps(1.));
4726    }
4727
4728    #[simd_test(enable = "avx")]
4729    unsafe fn test_mm256_set1_epi8() {
4730        let r = _mm256_set1_epi8(1);
4731        assert_eq_m256i(r, _mm256_set1_epi8(1));
4732    }
4733
4734    #[simd_test(enable = "avx")]
4735    unsafe fn test_mm256_set1_epi16() {
4736        let r = _mm256_set1_epi16(1);
4737        assert_eq_m256i(r, _mm256_set1_epi16(1));
4738    }
4739
4740    #[simd_test(enable = "avx")]
4741    unsafe fn test_mm256_set1_epi32() {
4742        let r = _mm256_set1_epi32(1);
4743        assert_eq_m256i(r, _mm256_set1_epi32(1));
4744    }
4745
4746    #[simd_test(enable = "avx")]
4747    unsafe fn test_mm256_set1_epi64x() {
4748        let r = _mm256_set1_epi64x(1);
4749        assert_eq_m256i(r, _mm256_set1_epi64x(1));
4750    }
4751
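    // The cast intrinsics reinterpret bits without any numeric conversion; the
    // odd-looking f32 constants are just the two 32-bit halves of each f64 encoding
    // (1.0f64 is 0x3FF0_0000_0000_0000, whose upper half reads back as 1.875f32).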
4752    #[simd_test(enable = "avx")]
4753    unsafe fn test_mm256_castpd_ps() {
4754        let a = _mm256_setr_pd(1., 2., 3., 4.);
4755        let r = _mm256_castpd_ps(a);
4756        let e = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
4757        assert_eq_m256(r, e);
4758    }
4759
4760    #[simd_test(enable = "avx")]
4761    unsafe fn test_mm256_castps_pd() {
4762        let a = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
4763        let r = _mm256_castps_pd(a);
4764        let e = _mm256_setr_pd(1., 2., 3., 4.);
4765        assert_eq_m256d(r, e);
4766    }
4767
4768    #[simd_test(enable = "avx")]
4769    unsafe fn test_mm256_castps_si256() {
4770        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4771        let r = _mm256_castps_si256(a);
4772        #[rustfmt::skip]
4773        let e = _mm256_setr_epi8(
4774            0, 0, -128, 63, 0, 0, 0, 64,
4775            0, 0, 64, 64, 0, 0, -128, 64,
4776            0, 0, -96, 64, 0, 0, -64, 64,
4777            0, 0, -32, 64, 0, 0, 0, 65,
4778        );
4779        assert_eq_m256i(r, e);
4780    }
4781
4782    #[simd_test(enable = "avx")]
4783    unsafe fn test_mm256_castsi256_ps() {
4784        #[rustfmt::skip]
4785        let a = _mm256_setr_epi8(
4786            0, 0, -128, 63, 0, 0, 0, 64,
4787            0, 0, 64, 64, 0, 0, -128, 64,
4788            0, 0, -96, 64, 0, 0, -64, 64,
4789            0, 0, -32, 64, 0, 0, 0, 65,
4790        );
4791        let r = _mm256_castsi256_ps(a);
4792        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4793        assert_eq_m256(r, e);
4794    }
4795
4796    #[simd_test(enable = "avx")]
4797    unsafe fn test_mm256_castpd_si256() {
4798        let a = _mm256_setr_pd(1., 2., 3., 4.);
4799        let r = _mm256_castpd_si256(a);
4800        assert_eq_m256d(transmute(r), a);
4801    }
4802
4803    #[simd_test(enable = "avx")]
4804    unsafe fn test_mm256_castsi256_pd() {
4805        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4806        let r = _mm256_castsi256_pd(a);
4807        assert_eq_m256d(r, transmute(a));
4808    }
4809
4810    #[simd_test(enable = "avx")]
4811    unsafe fn test_mm256_castps256_ps128() {
4812        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4813        let r = _mm256_castps256_ps128(a);
4814        assert_eq_m128(r, _mm_setr_ps(1., 2., 3., 4.));
4815    }
4816
4817    #[simd_test(enable = "avx")]
4818    unsafe fn test_mm256_castpd256_pd128() {
4819        let a = _mm256_setr_pd(1., 2., 3., 4.);
4820        let r = _mm256_castpd256_pd128(a);
4821        assert_eq_m128d(r, _mm_setr_pd(1., 2.));
4822    }
4823
4824    #[simd_test(enable = "avx")]
4825    unsafe fn test_mm256_castsi256_si128() {
4826        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4827        let r = _mm256_castsi256_si128(a);
4828        assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
4829    }
4830
4831    #[simd_test(enable = "avx")]
4832    unsafe fn test_mm256_castps128_ps256() {
4833        let a = _mm_setr_ps(1., 2., 3., 4.);
4834        let r = _mm256_castps128_ps256(a);
4835        assert_eq_m128(_mm256_castps256_ps128(r), a);
4836    }
4837
4838    #[simd_test(enable = "avx")]
4839    unsafe fn test_mm256_castpd128_pd256() {
4840        let a = _mm_setr_pd(1., 2.);
4841        let r = _mm256_castpd128_pd256(a);
4842        assert_eq_m128d(_mm256_castpd256_pd128(r), a);
4843    }
4844
4845    #[simd_test(enable = "avx")]
4846    unsafe fn test_mm256_castsi128_si256() {
4847        let a = _mm_setr_epi32(1, 2, 3, 4);
4848        let r = _mm256_castsi128_si256(a);
4849        assert_eq_m128i(_mm256_castsi256_si128(r), a);
4850    }
4851
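    // The `zext` widenings zero the upper 128 bits of the result, unlike the
    // 128-to-256 `cast` conversions above, which leave the upper half undefined.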
4852    #[simd_test(enable = "avx")]
4853    unsafe fn test_mm256_zextps128_ps256() {
4854        let a = _mm_setr_ps(1., 2., 3., 4.);
4855        let r = _mm256_zextps128_ps256(a);
4856        let e = _mm256_setr_ps(1., 2., 3., 4., 0., 0., 0., 0.);
4857        assert_eq_m256(r, e);
4858    }
4859
4860    #[simd_test(enable = "avx")]
4861    unsafe fn test_mm256_zextsi128_si256() {
4862        let a = _mm_setr_epi64x(1, 2);
4863        let r = _mm256_zextsi128_si256(a);
4864        let e = _mm256_setr_epi64x(1, 2, 0, 0);
4865        assert_eq_m256i(r, e);
4866    }
4867
4868    #[simd_test(enable = "avx")]
4869    unsafe fn test_mm256_zextpd128_pd256() {
4870        let a = _mm_setr_pd(1., 2.);
4871        let r = _mm256_zextpd128_pd256(a);
4872        let e = _mm256_setr_pd(1., 2., 0., 0.);
4873        assert_eq_m256d(r, e);
4874    }
4875
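    // `set_m128*` takes the high half first and the low half second; `setr_m128*`
    // (below) takes them in memory order, mirroring the scalar `set`/`setr` pairs.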
4876    #[simd_test(enable = "avx")]
4877    unsafe fn test_mm256_set_m128() {
4878        let hi = _mm_setr_ps(5., 6., 7., 8.);
4879        let lo = _mm_setr_ps(1., 2., 3., 4.);
4880        let r = _mm256_set_m128(hi, lo);
4881        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4882        assert_eq_m256(r, e);
4883    }
4884
4885    #[simd_test(enable = "avx")]
4886    unsafe fn test_mm256_set_m128d() {
4887        let hi = _mm_setr_pd(3., 4.);
4888        let lo = _mm_setr_pd(1., 2.);
4889        let r = _mm256_set_m128d(hi, lo);
4890        let e = _mm256_setr_pd(1., 2., 3., 4.);
4891        assert_eq_m256d(r, e);
4892    }
4893
4894    #[simd_test(enable = "avx")]
4895    unsafe fn test_mm256_set_m128i() {
4896        #[rustfmt::skip]
4897        let hi = _mm_setr_epi8(
4898            17, 18, 19, 20,
4899            21, 22, 23, 24,
4900            25, 26, 27, 28,
4901            29, 30, 31, 32,
4902        );
4903        #[rustfmt::skip]
4904        let lo = _mm_setr_epi8(
4905            1, 2, 3, 4,
4906            5, 6, 7, 8,
4907            9, 10, 11, 12,
4908            13, 14, 15, 16,
4909        );
4910        let r = _mm256_set_m128i(hi, lo);
4911        #[rustfmt::skip]
4912        let e = _mm256_setr_epi8(
4913            1, 2, 3, 4, 5, 6, 7, 8,
4914            9, 10, 11, 12, 13, 14, 15, 16,
4915            17, 18, 19, 20, 21, 22, 23, 24,
4916            25, 26, 27, 28, 29, 30, 31, 32,
4917        );
4918        assert_eq_m256i(r, e);
4919    }
4920
4921    #[simd_test(enable = "avx")]
4922    unsafe fn test_mm256_setr_m128() {
4923        let lo = _mm_setr_ps(1., 2., 3., 4.);
4924        let hi = _mm_setr_ps(5., 6., 7., 8.);
4925        let r = _mm256_setr_m128(lo, hi);
4926        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4927        assert_eq_m256(r, e);
4928    }
4929
4930    #[simd_test(enable = "avx")]
4931    unsafe fn test_mm256_setr_m128d() {
4932        let lo = _mm_setr_pd(1., 2.);
4933        let hi = _mm_setr_pd(3., 4.);
4934        let r = _mm256_setr_m128d(lo, hi);
4935        let e = _mm256_setr_pd(1., 2., 3., 4.);
4936        assert_eq_m256d(r, e);
4937    }
4938
4939    #[simd_test(enable = "avx")]
4940    unsafe fn test_mm256_setr_m128i() {
4941        #[rustfmt::skip]
4942        let lo = _mm_setr_epi8(
4943            1, 2, 3, 4,
4944            5, 6, 7, 8,
4945            9, 10, 11, 12,
4946            13, 14, 15, 16,
4947        );
4948        #[rustfmt::skip]
4949        let hi = _mm_setr_epi8(
4950            17, 18, 19, 20, 21, 22, 23, 24,
4951            25, 26, 27, 28, 29, 30, 31, 32,
4952        );
4953        let r = _mm256_setr_m128i(lo, hi);
4954        #[rustfmt::skip]
4955        let e = _mm256_setr_epi8(
4956            1, 2, 3, 4, 5, 6, 7, 8,
4957            9, 10, 11, 12, 13, 14, 15, 16,
4958            17, 18, 19, 20, 21, 22, 23, 24,
4959            25, 26, 27, 28, 29, 30, 31, 32,
4960        );
4961        assert_eq_m256i(r, e);
4962    }
4963
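    // loadu2/storeu2 move the two 128-bit halves through separate, independently
    // unaligned pointers, with the high-half address passed first.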
4964    #[simd_test(enable = "avx")]
4965    unsafe fn test_mm256_loadu2_m128() {
4966        let hi = &[5., 6., 7., 8.];
4967        let hiaddr = hi.as_ptr();
4968        let lo = &[1., 2., 3., 4.];
4969        let loaddr = lo.as_ptr();
4970        let r = _mm256_loadu2_m128(hiaddr, loaddr);
4971        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4972        assert_eq_m256(r, e);
4973    }
4974
4975    #[simd_test(enable = "avx")]
4976    unsafe fn test_mm256_loadu2_m128d() {
4977        let hi = &[3., 4.];
4978        let hiaddr = hi.as_ptr();
4979        let lo = &[1., 2.];
4980        let loaddr = lo.as_ptr();
4981        let r = _mm256_loadu2_m128d(hiaddr, loaddr);
4982        let e = _mm256_setr_pd(1., 2., 3., 4.);
4983        assert_eq_m256d(r, e);
4984    }
4985
4986    #[simd_test(enable = "avx")]
4987    unsafe fn test_mm256_loadu2_m128i() {
4988        #[rustfmt::skip]
4989        let hi = _mm_setr_epi8(
4990            17, 18, 19, 20, 21, 22, 23, 24,
4991            25, 26, 27, 28, 29, 30, 31, 32,
4992        );
4993        #[rustfmt::skip]
4994        let lo = _mm_setr_epi8(
4995            1, 2, 3, 4, 5, 6, 7, 8,
4996            9, 10, 11, 12, 13, 14, 15, 16,
4997        );
4998        let r = _mm256_loadu2_m128i(ptr::addr_of!(hi) as *const _, ptr::addr_of!(lo) as *const _);
4999        #[rustfmt::skip]
5000        let e = _mm256_setr_epi8(
5001            1, 2, 3, 4, 5, 6, 7, 8,
5002            9, 10, 11, 12, 13, 14, 15, 16,
5003            17, 18, 19, 20, 21, 22, 23, 24,
5004            25, 26, 27, 28, 29, 30, 31, 32,
5005        );
5006        assert_eq_m256i(r, e);
5007    }
5008
5009    #[simd_test(enable = "avx")]
5010    unsafe fn test_mm256_storeu2_m128() {
5011        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5012        let mut hi = _mm_undefined_ps();
5013        let mut lo = _mm_undefined_ps();
5014        _mm256_storeu2_m128(
5015            ptr::addr_of_mut!(hi) as *mut f32,
5016            ptr::addr_of_mut!(lo) as *mut f32,
5017            a,
5018        );
5019        assert_eq_m128(hi, _mm_setr_ps(5., 6., 7., 8.));
5020        assert_eq_m128(lo, _mm_setr_ps(1., 2., 3., 4.));
5021    }
5022
5023    #[simd_test(enable = "avx")]
5024    unsafe fn test_mm256_storeu2_m128d() {
5025        let a = _mm256_setr_pd(1., 2., 3., 4.);
5026        let mut hi = _mm_undefined_pd();
5027        let mut lo = _mm_undefined_pd();
5028        _mm256_storeu2_m128d(
5029            ptr::addr_of_mut!(hi) as *mut f64,
5030            ptr::addr_of_mut!(lo) as *mut f64,
5031            a,
5032        );
5033        assert_eq_m128d(hi, _mm_setr_pd(3., 4.));
5034        assert_eq_m128d(lo, _mm_setr_pd(1., 2.));
5035    }
5036
5037    #[simd_test(enable = "avx")]
5038    unsafe fn test_mm256_storeu2_m128i() {
5039        #[rustfmt::skip]
5040        let a = _mm256_setr_epi8(
5041            1, 2, 3, 4, 5, 6, 7, 8,
5042            9, 10, 11, 12, 13, 14, 15, 16,
5043            17, 18, 19, 20, 21, 22, 23, 24,
5044            25, 26, 27, 28, 29, 30, 31, 32,
5045        );
5046        let mut hi = _mm_undefined_si128();
5047        let mut lo = _mm_undefined_si128();
5048        _mm256_storeu2_m128i(ptr::addr_of_mut!(hi), ptr::addr_of_mut!(lo), a);
5049        #[rustfmt::skip]
5050        let e_hi = _mm_setr_epi8(
5051            17, 18, 19, 20, 21, 22, 23, 24,
5052            25, 26, 27, 28, 29, 30, 31, 32
5053        );
5054        #[rustfmt::skip]
5055        let e_lo = _mm_setr_epi8(
5056            1, 2, 3, 4, 5, 6, 7, 8,
5057            9, 10, 11, 12, 13, 14, 15, 16
5058        );
5059
5060        assert_eq_m128i(hi, e_hi);
5061        assert_eq_m128i(lo, e_lo);
5062    }
5063
5064    #[simd_test(enable = "avx")]
5065    unsafe fn test_mm256_cvtss_f32() {
5066        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5067        let r = _mm256_cvtss_f32(a);
5068        assert_eq!(r, 1.);
5069    }
5070}