// core/stdarch/crates/core_arch/src/x86/avx2.rs
//! Advanced Vector Extensions 2 (AVX2)
//!
//! AVX2 expands most AVX commands to 256-bit wide vector registers and
//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//!   System Instructions][amd64_ref].
//!
//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
//! overview of the instructions available.
//!
//! [intel64_ref]: https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: https://docs.amd.com/v/u/en-US/24594_3.37
//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate

use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::simd::*;

#[cfg(test)]
use stdarch_test::assert_instr;
26
/// Computes the absolute values of packed 32-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_abs_epi32(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i32x8();
        // Select the negated lane where the lane is negative, otherwise the
        // lane itself. `simd_neg` wraps, so `i32::MIN` maps to itself — the
        // same behavior as the `vpabsd` instruction.
        let r = simd_select::<m32x8, _>(simd_lt(a, i32x8::ZERO), simd_neg(a), a);
        transmute(r)
    }
}
/// Computes the absolute values of packed 16-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_abs_epi16(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i16x16();
        // Negative lanes are replaced by their (wrapping) negation, so
        // `i16::MIN` stays `i16::MIN`, matching `vpabsw`.
        let r = simd_select::<m16x16, _>(simd_lt(a, i16x16::ZERO), simd_neg(a), a);
        transmute(r)
    }
}
/// Computes the absolute values of packed 8-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_abs_epi8(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i8x32();
        // Negative lanes are replaced by their (wrapping) negation, so
        // `i8::MIN` stays `i8::MIN`, matching `vpabsb`.
        let r = simd_select::<m8x32, _>(simd_lt(a, i8x32::ZERO), simd_neg(a), a);
        transmute(r)
    }
}
75/// Adds packed 64-bit integers in `a` and `b`.
76///
77/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64)
78#[inline]
79#[target_feature(enable = "avx2")]
80#[cfg_attr(test, assert_instr(vpaddq))]
81#[stable(feature = "simd_x86", since = "1.27.0")]
82#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
83pub const fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
84    unsafe { transmute(simd_add(a.as_i64x4(), b.as_i64x4())) }
85}
86
87/// Adds packed 32-bit integers in `a` and `b`.
88///
89/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32)
90#[inline]
91#[target_feature(enable = "avx2")]
92#[cfg_attr(test, assert_instr(vpaddd))]
93#[stable(feature = "simd_x86", since = "1.27.0")]
94#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
95pub const fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
96    unsafe { transmute(simd_add(a.as_i32x8(), b.as_i32x8())) }
97}
98
99/// Adds packed 16-bit integers in `a` and `b`.
100///
101/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16)
102#[inline]
103#[target_feature(enable = "avx2")]
104#[cfg_attr(test, assert_instr(vpaddw))]
105#[stable(feature = "simd_x86", since = "1.27.0")]
106#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
107pub const fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
108    unsafe { transmute(simd_add(a.as_i16x16(), b.as_i16x16())) }
109}
110
111/// Adds packed 8-bit integers in `a` and `b`.
112///
113/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8)
114#[inline]
115#[target_feature(enable = "avx2")]
116#[cfg_attr(test, assert_instr(vpaddb))]
117#[stable(feature = "simd_x86", since = "1.27.0")]
118#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
119pub const fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
120    unsafe { transmute(simd_add(a.as_i8x32(), b.as_i8x32())) }
121}
122
123/// Adds packed 8-bit integers in `a` and `b` using saturation.
124///
125/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8)
126#[inline]
127#[target_feature(enable = "avx2")]
128#[cfg_attr(test, assert_instr(vpaddsb))]
129#[stable(feature = "simd_x86", since = "1.27.0")]
130#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
131pub const fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
132    unsafe { transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32())) }
133}
134
135/// Adds packed 16-bit integers in `a` and `b` using saturation.
136///
137/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16)
138#[inline]
139#[target_feature(enable = "avx2")]
140#[cfg_attr(test, assert_instr(vpaddsw))]
141#[stable(feature = "simd_x86", since = "1.27.0")]
142#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
143pub const fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
144    unsafe { transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16())) }
145}
146
147/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
148///
149/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8)
150#[inline]
151#[target_feature(enable = "avx2")]
152#[cfg_attr(test, assert_instr(vpaddusb))]
153#[stable(feature = "simd_x86", since = "1.27.0")]
154#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
155pub const fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
156    unsafe { transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32())) }
157}
158
159/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
160///
161/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16)
162#[inline]
163#[target_feature(enable = "avx2")]
164#[cfg_attr(test, assert_instr(vpaddusw))]
165#[stable(feature = "simd_x86", since = "1.27.0")]
166#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
167pub const fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
168    unsafe { transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16())) }
169}
170
/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
/// result, shifts the result right by `n` bytes, and returns the low 16 bytes.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 7))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);

    // If palignr is shifting the pair of vectors more than the size of two
    // lanes, emit zero.
    if IMM8 >= 32 {
        return _mm256_setzero_si256();
    }
    // If palignr is shifting the pair of input vectors more than one lane,
    // but less than two lanes, convert to shifting in zeroes.
    let (a, b) = if IMM8 > 16 {
        (_mm256_setzero_si256(), a)
    } else {
        (a, b)
    };
    unsafe {
        // Shifting by exactly one lane just returns `a` (after the remapping
        // above); this `transmute` is an identity conversion.
        if IMM8 == 16 {
            return transmute(a);
        }
    }
    // Maps output byte `i` to its index into the shuffle's concatenated
    // operands `[b, a]` (within each 128-bit lane). Bytes below `16 - shift`
    // come from `b` at offset `i + shift`; the remaining bytes wrap into `a`,
    // which the extra `+ 16` reaches because in `simd_shuffle!` indices past
    // the first operand's length select from the second operand.
    const fn mask(shift: u32, i: u32) -> u32 {
        let shift = shift % 16;
        let mod_i = i % 16;
        if mod_i < (16 - shift) {
            i + shift
        } else {
            i + 16 + shift
        }
    }

    unsafe {
        let r: i8x32 = simd_shuffle!(
            b.as_i8x32(),
            a.as_i8x32(),
            [
                mask(IMM8 as u32, 0),
                mask(IMM8 as u32, 1),
                mask(IMM8 as u32, 2),
                mask(IMM8 as u32, 3),
                mask(IMM8 as u32, 4),
                mask(IMM8 as u32, 5),
                mask(IMM8 as u32, 6),
                mask(IMM8 as u32, 7),
                mask(IMM8 as u32, 8),
                mask(IMM8 as u32, 9),
                mask(IMM8 as u32, 10),
                mask(IMM8 as u32, 11),
                mask(IMM8 as u32, 12),
                mask(IMM8 as u32, 13),
                mask(IMM8 as u32, 14),
                mask(IMM8 as u32, 15),
                mask(IMM8 as u32, 16),
                mask(IMM8 as u32, 17),
                mask(IMM8 as u32, 18),
                mask(IMM8 as u32, 19),
                mask(IMM8 as u32, 20),
                mask(IMM8 as u32, 21),
                mask(IMM8 as u32, 22),
                mask(IMM8 as u32, 23),
                mask(IMM8 as u32, 24),
                mask(IMM8 as u32, 25),
                mask(IMM8 as u32, 26),
                mask(IMM8 as u32, 27),
                mask(IMM8 as u32, 28),
                mask(IMM8 as u32, 29),
                mask(IMM8 as u32, 30),
                mask(IMM8 as u32, 31),
            ],
        );
        transmute(r)
    }
}
254/// Computes the bitwise AND of 256 bits (representing integer data)
255/// in `a` and `b`.
256///
257/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256)
258#[inline]
259#[target_feature(enable = "avx2")]
260#[cfg_attr(test, assert_instr(vandps))]
261#[stable(feature = "simd_x86", since = "1.27.0")]
262#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
263pub const fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
264    unsafe { transmute(simd_and(a.as_i64x4(), b.as_i64x4())) }
265}
266
/// Computes the bitwise NOT of 256 bits (representing integer data)
/// in `a` and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // XOR with an all-ones vector computes the bitwise NOT of `a`;
        // the result is then ANDed with `b`, i.e. `(!a) & b`.
        let all_ones = _mm256_set1_epi8(-1);
        transmute(simd_and(
            simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
            b.as_i64x4(),
        ))
    }
}
/// Averages packed unsigned 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Widen to u32 so `a + b + 1` cannot overflow, then compute the
        // rounding average `(a + b + 1) >> 1` and narrow back to u16.
        let a = simd_cast::<_, u32x16>(a.as_u16x16());
        let b = simd_cast::<_, u32x16>(b.as_u16x16());
        let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1));
        transmute(simd_cast::<_, u16x16>(r))
    }
}
/// Averages packed unsigned 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Widen to u16 so `a + b + 1` cannot overflow, then compute the
        // rounding average `(a + b + 1) >> 1` and narrow back to u8.
        let a = simd_cast::<_, u16x32>(a.as_u8x32());
        let b = simd_cast::<_, u16x32>(b.as_u8x32());
        let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1));
        transmute(simd_cast::<_, u8x32>(r))
    }
}
/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        let a = a.as_i32x4();
        let b = b.as_i32x4();
        // Output lane `i` comes from `a` (shuffle index `i`) when bit `i` of
        // IMM4 is clear, or from `b` (index `i + 4`) when it is set. Each
        // 4-entry table, indexed by a 2-bit field of the mask, encodes that
        // choice for one lane of a pair.
        let r: i32x4 = simd_shuffle!(
            a,
            b,
            [
                [0, 4, 0, 4][IMM4 as usize & 0b11],
                [1, 1, 5, 5][IMM4 as usize & 0b11],
                [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11],
                [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11],
            ],
        );
        transmute(r)
    }
}
/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i32x8();
        let b = b.as_i32x8();
        // Output lane `i` comes from `a` (shuffle index `i`) when bit `i` of
        // IMM8 is clear, or from `b` (index `i + 8`) when it is set. Each
        // 4-entry table, indexed by a 2-bit field of the mask, encodes that
        // choice for one lane of a pair.
        let r: i32x8 = simd_shuffle!(
            a,
            b,
            [
                [0, 8, 0, 8][IMM8 as usize & 0b11],
                [1, 1, 9, 9][IMM8 as usize & 0b11],
                [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11],
                [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11],
                [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11],
                [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11],
                [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11],
                [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11],
            ],
        );
        transmute(r)
    }
}
/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpblendw, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x16();
        let b = b.as_i16x16();

        // Like `vpblendw`, the 8-bit mask is applied to each 128-bit half
        // independently: lanes 8..15 reuse the same control bits as lanes
        // 0..7. Output lane `i` comes from `a` (index `i`) when its control
        // bit is clear, or from `b` (index `i + 16`) when it is set.
        let r: i16x16 = simd_shuffle!(
            a,
            b,
            [
                [0, 16, 0, 16][IMM8 as usize & 0b11],
                [1, 1, 17, 17][IMM8 as usize & 0b11],
                [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11],
                [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11],
                [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11],
                [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11],
                [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11],
                [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11],
                [8, 24, 8, 24][IMM8 as usize & 0b11],
                [9, 9, 25, 25][IMM8 as usize & 0b11],
                [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11],
                [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11],
                [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11],
                [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11],
                [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11],
                [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11],
            ],
        );
        transmute(r)
    }
}
/// Blends packed 8-bit integers from `a` and `b` using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
    unsafe {
        // Comparing against zero broadcasts each byte's sign bit across the
        // lane, so — like `vpblendvb` — only the most significant bit of each
        // mask byte decides: set selects the byte from `b`, clear from `a`.
        let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO);
        transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32()))
    }
}
/// Broadcasts the low packed 8-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
    unsafe {
        // All shuffle indices are 0, so every output lane copies lane 0 of
        // `a`; the zero vector is only an unused second shuffle operand.
        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 16]);
        transmute::<i8x16, _>(ret)
    }
}
/// Broadcasts the low packed 8-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
    unsafe {
        // 32 indices of 0 widen the result to 256 bits while replicating
        // lane 0 of `a`; the zero vector is only an unused shuffle operand.
        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 32]);
        transmute::<i8x32, _>(ret)
    }
}
// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
// often compiled to `vbroadcastss`.
/// Broadcasts the low packed 32-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
    unsafe {
        // All shuffle indices are 0: every output lane copies lane 0 of `a`.
        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 4]);
        transmute::<i32x4, _>(ret)
    }
}
// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
// often compiled to `vbroadcastss`.
/// Broadcasts the low packed 32-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
    unsafe {
        // 8 indices of 0 widen the result to 256 bits while replicating
        // lane 0 of `a`.
        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 8]);
        transmute::<i32x8, _>(ret)
    }
}
/// Broadcasts the low packed 64-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
// Emits `vmovddup` instead of `vpbroadcastq`
// See https://github.com/rust-lang/stdarch/issues/791
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
    unsafe {
        // Both output lanes copy lane 0 of `a` (note: `a` is used as both
        // shuffle operands here, unlike the zero-padded broadcasts above).
        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
        transmute::<i64x2, _>(ret)
    }
}
/// Broadcasts the low packed 64-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
    unsafe {
        // 4 indices of 0 widen the result to 256 bits while replicating
        // lane 0 of `a`.
        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
        transmute::<i64x4, _>(ret)
    }
}
/// Broadcasts the low double-precision (64-bit) floating-point element
/// from `a` to all elements of the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
    // All shuffle indices are 0, so both output lanes copy element 0 of `a`;
    // the zero vector is only an unused second shuffle operand.
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 2]) }
}
/// Broadcasts the low double-precision (64-bit) floating-point element
/// from `a` to all elements of the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
    // 4 indices of 0 widen the result to 256 bits while replicating
    // element 0 of `a`.
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 4]) }
}
/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        // Indices [0, 1, 0, 1] repeat both 64-bit lanes of `a`, duplicating
        // its 128 bits into both halves of the 256-bit result.
        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
        transmute::<i64x4, _>(ret)
    }
}
// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
// `vbroadcastf128`.
/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        // Indices [0, 1, 0, 1] repeat both 64-bit lanes of `a`, duplicating
        // its 128 bits into both halves of the 256-bit result.
        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
        transmute::<i64x4, _>(ret)
    }
}
/// Broadcasts the low single-precision (32-bit) floating-point element
/// from `a` to all elements of the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastss_ps(a: __m128) -> __m128 {
    // All shuffle indices are 0, so every output lane copies element 0 of
    // `a`; the zero vector is only an unused second shuffle operand.
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 4]) }
}
/// Broadcasts the low single-precision (32-bit) floating-point element
/// from `a` to all elements of the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
    // 8 indices of 0 widen the result to 256 bits while replicating
    // element 0 of `a`.
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 8]) }
}
/// Broadcasts the low packed 16-bit integer from a to all elements of
/// the 128-bit returned value
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
    unsafe {
        // All shuffle indices are 0: every output lane copies lane 0 of `a`.
        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 8]);
        transmute::<i16x8, _>(ret)
    }
}
/// Broadcasts the low packed 16-bit integer from a to all elements of
/// the 256-bit returned value
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
    unsafe {
        // 16 indices of 0 widen the result to 256 bits while replicating
        // lane 0 of `a`.
        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 16]);
        transmute::<i16x16, _>(ret)
    }
}
654/// Compares packed 64-bit integers in `a` and `b` for equality.
655///
656/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64)
657#[inline]
658#[target_feature(enable = "avx2")]
659#[cfg_attr(test, assert_instr(vpcmpeqq))]
660#[stable(feature = "simd_x86", since = "1.27.0")]
661#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
662pub const fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
663    unsafe { transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4())) }
664}
665
666/// Compares packed 32-bit integers in `a` and `b` for equality.
667///
668/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32)
669#[inline]
670#[target_feature(enable = "avx2")]
671#[cfg_attr(test, assert_instr(vpcmpeqd))]
672#[stable(feature = "simd_x86", since = "1.27.0")]
673#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
674pub const fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
675    unsafe { transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8())) }
676}
677
678/// Compares packed 16-bit integers in `a` and `b` for equality.
679///
680/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16)
681#[inline]
682#[target_feature(enable = "avx2")]
683#[cfg_attr(test, assert_instr(vpcmpeqw))]
684#[stable(feature = "simd_x86", since = "1.27.0")]
685#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
686pub const fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
687    unsafe { transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16())) }
688}
689
690/// Compares packed 8-bit integers in `a` and `b` for equality.
691///
692/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8)
693#[inline]
694#[target_feature(enable = "avx2")]
695#[cfg_attr(test, assert_instr(vpcmpeqb))]
696#[stable(feature = "simd_x86", since = "1.27.0")]
697#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
698pub const fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
699    unsafe { transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32())) }
700}
701
702/// Compares packed 64-bit integers in `a` and `b` for greater-than.
703///
704/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64)
705#[inline]
706#[target_feature(enable = "avx2")]
707#[cfg_attr(test, assert_instr(vpcmpgtq))]
708#[stable(feature = "simd_x86", since = "1.27.0")]
709#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
710pub const fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
711    unsafe { transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4())) }
712}
713
714/// Compares packed 32-bit integers in `a` and `b` for greater-than.
715///
716/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32)
717#[inline]
718#[target_feature(enable = "avx2")]
719#[cfg_attr(test, assert_instr(vpcmpgtd))]
720#[stable(feature = "simd_x86", since = "1.27.0")]
721#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
722pub const fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
723    unsafe { transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8())) }
724}
725
726/// Compares packed 16-bit integers in `a` and `b` for greater-than.
727///
728/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16)
729#[inline]
730#[target_feature(enable = "avx2")]
731#[cfg_attr(test, assert_instr(vpcmpgtw))]
732#[stable(feature = "simd_x86", since = "1.27.0")]
733#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
734pub const fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
735    unsafe { transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16())) }
736}
737
738/// Compares packed 8-bit integers in `a` and `b` for greater-than.
739///
740/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8)
741#[inline]
742#[target_feature(enable = "avx2")]
743#[cfg_attr(test, assert_instr(vpcmpgtb))]
744#[stable(feature = "simd_x86", since = "1.27.0")]
745#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
746pub const fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
747    unsafe { transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32())) }
748}
749
750/// Sign-extend 16-bit integers to 32-bit integers.
751///
752/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32)
753#[inline]
754#[target_feature(enable = "avx2")]
755#[cfg_attr(test, assert_instr(vpmovsxwd))]
756#[stable(feature = "simd_x86", since = "1.27.0")]
757#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
758pub const fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
759    unsafe { transmute::<i32x8, _>(simd_cast(a.as_i16x8())) }
760}
761
762/// Sign-extend 16-bit integers to 64-bit integers.
763///
764/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64)
765#[inline]
766#[target_feature(enable = "avx2")]
767#[cfg_attr(test, assert_instr(vpmovsxwq))]
768#[stable(feature = "simd_x86", since = "1.27.0")]
769#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
770pub const fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
771    unsafe {
772        let a = a.as_i16x8();
773        let v64: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
774        transmute::<i64x4, _>(simd_cast(v64))
775    }
776}
777
778/// Sign-extend 32-bit integers to 64-bit integers.
779///
780/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64)
781#[inline]
782#[target_feature(enable = "avx2")]
783#[cfg_attr(test, assert_instr(vpmovsxdq))]
784#[stable(feature = "simd_x86", since = "1.27.0")]
785#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
786pub const fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
787    unsafe { transmute::<i64x4, _>(simd_cast(a.as_i32x4())) }
788}
789
790/// Sign-extend 8-bit integers to 16-bit integers.
791///
792/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16)
793#[inline]
794#[target_feature(enable = "avx2")]
795#[cfg_attr(test, assert_instr(vpmovsxbw))]
796#[stable(feature = "simd_x86", since = "1.27.0")]
797#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
798pub const fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
799    unsafe { transmute::<i16x16, _>(simd_cast(a.as_i8x16())) }
800}
801
802/// Sign-extend 8-bit integers to 32-bit integers.
803///
804/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32)
805#[inline]
806#[target_feature(enable = "avx2")]
807#[cfg_attr(test, assert_instr(vpmovsxbd))]
808#[stable(feature = "simd_x86", since = "1.27.0")]
809#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
810pub const fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
811    unsafe {
812        let a = a.as_i8x16();
813        let v64: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
814        transmute::<i32x8, _>(simd_cast(v64))
815    }
816}
817
818/// Sign-extend 8-bit integers to 64-bit integers.
819///
820/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64)
821#[inline]
822#[target_feature(enable = "avx2")]
823#[cfg_attr(test, assert_instr(vpmovsxbq))]
824#[stable(feature = "simd_x86", since = "1.27.0")]
825#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
826pub const fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
827    unsafe {
828        let a = a.as_i8x16();
829        let v32: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
830        transmute::<i64x4, _>(simd_cast(v32))
831    }
832}
833
834/// Zeroes extend packed unsigned 16-bit integers in `a` to packed 32-bit
835/// integers, and stores the results in `dst`.
836///
837/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32)
838#[inline]
839#[target_feature(enable = "avx2")]
840#[cfg_attr(test, assert_instr(vpmovzxwd))]
841#[stable(feature = "simd_x86", since = "1.27.0")]
842#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
843pub const fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
844    unsafe { transmute::<i32x8, _>(simd_cast(a.as_u16x8())) }
845}
846
847/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
848/// integers. The upper four elements of `a` are unused.
849///
850/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64)
851#[inline]
852#[target_feature(enable = "avx2")]
853#[cfg_attr(test, assert_instr(vpmovzxwq))]
854#[stable(feature = "simd_x86", since = "1.27.0")]
855#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
856pub const fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
857    unsafe {
858        let a = a.as_u16x8();
859        let v64: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
860        transmute::<i64x4, _>(simd_cast(v64))
861    }
862}
863
864/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
865///
866/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64)
867#[inline]
868#[target_feature(enable = "avx2")]
869#[cfg_attr(test, assert_instr(vpmovzxdq))]
870#[stable(feature = "simd_x86", since = "1.27.0")]
871#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
872pub const fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
873    unsafe { transmute::<i64x4, _>(simd_cast(a.as_u32x4())) }
874}
875
876/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
877///
878/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16)
879#[inline]
880#[target_feature(enable = "avx2")]
881#[cfg_attr(test, assert_instr(vpmovzxbw))]
882#[stable(feature = "simd_x86", since = "1.27.0")]
883#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
884pub const fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
885    unsafe { transmute::<i16x16, _>(simd_cast(a.as_u8x16())) }
886}
887
888/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
889/// integers. The upper eight elements of `a` are unused.
890///
891/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32)
892#[inline]
893#[target_feature(enable = "avx2")]
894#[cfg_attr(test, assert_instr(vpmovzxbd))]
895#[stable(feature = "simd_x86", since = "1.27.0")]
896#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
897pub const fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
898    unsafe {
899        let a = a.as_u8x16();
900        let v64: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
901        transmute::<i32x8, _>(simd_cast(v64))
902    }
903}
904
905/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
906/// integers. The upper twelve elements of `a` are unused.
907///
908/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64)
909#[inline]
910#[target_feature(enable = "avx2")]
911#[cfg_attr(test, assert_instr(vpmovzxbq))]
912#[stable(feature = "simd_x86", since = "1.27.0")]
913#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
914pub const fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
915    unsafe {
916        let a = a.as_u8x16();
917        let v32: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
918        transmute::<i64x4, _>(simd_cast(v32))
919    }
920}
921
922/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`.
923///
924/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256)
925#[inline]
926#[target_feature(enable = "avx2")]
927#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
928#[rustc_legacy_const_generics(1)]
929#[stable(feature = "simd_x86", since = "1.27.0")]
930#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
931pub const fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
932    static_assert_uimm_bits!(IMM1, 1);
933    unsafe {
934        let a = a.as_i64x4();
935        let b = i64x4::ZERO;
936        let dst: i64x2 = simd_shuffle!(a, b, [[0, 1], [2, 3]][IMM1 as usize]);
937        transmute(dst)
938    }
939}
940
941/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
942///
943/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16)
944#[inline]
945#[target_feature(enable = "avx2")]
946#[cfg_attr(test, assert_instr(vphaddw))]
947#[stable(feature = "simd_x86", since = "1.27.0")]
948#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
949pub const fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
950    let a = a.as_i16x16();
951    let b = b.as_i16x16();
952    unsafe {
953        let even: i16x16 = simd_shuffle!(
954            a,
955            b,
956            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
957        );
958        let odd: i16x16 = simd_shuffle!(
959            a,
960            b,
961            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
962        );
963        simd_add(even, odd).as_m256i()
964    }
965}
966
967/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
968///
969/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32)
970#[inline]
971#[target_feature(enable = "avx2")]
972#[cfg_attr(test, assert_instr(vphaddd))]
973#[stable(feature = "simd_x86", since = "1.27.0")]
974#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
975pub const fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
976    let a = a.as_i32x8();
977    let b = b.as_i32x8();
978    unsafe {
979        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
980        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
981        simd_add(even, odd).as_m256i()
982    }
983}
984
985/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
986/// using saturation.
987///
988/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16)
989#[inline]
990#[target_feature(enable = "avx2")]
991#[cfg_attr(test, assert_instr(vphaddsw))]
992#[stable(feature = "simd_x86", since = "1.27.0")]
993pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
994    unsafe { transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) }
995}
996
997/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`.
998///
999/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16)
1000#[inline]
1001#[target_feature(enable = "avx2")]
1002#[cfg_attr(test, assert_instr(vphsubw))]
1003#[stable(feature = "simd_x86", since = "1.27.0")]
1004#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1005pub const fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
1006    let a = a.as_i16x16();
1007    let b = b.as_i16x16();
1008    unsafe {
1009        let even: i16x16 = simd_shuffle!(
1010            a,
1011            b,
1012            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
1013        );
1014        let odd: i16x16 = simd_shuffle!(
1015            a,
1016            b,
1017            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
1018        );
1019        simd_sub(even, odd).as_m256i()
1020    }
1021}
1022
1023/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`.
1024///
1025/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32)
1026#[inline]
1027#[target_feature(enable = "avx2")]
1028#[cfg_attr(test, assert_instr(vphsubd))]
1029#[stable(feature = "simd_x86", since = "1.27.0")]
1030#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1031pub const fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
1032    let a = a.as_i32x8();
1033    let b = b.as_i32x8();
1034    unsafe {
1035        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
1036        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
1037        simd_sub(even, odd).as_m256i()
1038    }
1039}
1040
1041/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`
1042/// using saturation.
1043///
1044/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16)
1045#[inline]
1046#[target_feature(enable = "avx2")]
1047#[cfg_attr(test, assert_instr(vphsubsw))]
1048#[stable(feature = "simd_x86", since = "1.27.0")]
1049pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1050    unsafe { transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) }
1051}
1052
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // Zero source plus an all-ones mask: every lane is gathered from memory.
    let zero = i32x4::ZERO;
    let neg_one = _mm_set1_epi32(-1).as_i32x4();
    let offsets = offsets.as_i32x4();
    // The gather intrinsic addresses memory in bytes, hence the i8 base pointer.
    let slice = slice as *const i8;
    let r = pgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1075
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
/// that position instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_epi32<const SCALE: i32>(
    src: __m128i,
    slice: *const i32,
    offsets: __m128i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // NOTE(review): per Intel's guide, lanes whose mask MSB is SET are gathered
    // from memory and lanes with a clear MSB keep the `src` lane; the summary
    // above appears inverted — confirm against the Intrinsics Guide.
    let src = src.as_i32x4();
    let mask = mask.as_i32x4();
    let offsets = offsets.as_i32x4();
    // The gather intrinsic addresses memory in bytes, hence the i8 base pointer.
    let slice = slice as *const i8;
    let r = pgatherdd(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1101
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    // Zero source plus an all-ones mask: every lane is gathered from memory.
    let zero = i32x8::ZERO;
    let neg_one = _mm256_set1_epi32(-1).as_i32x8();
    let offsets = offsets.as_i32x8();
    // The gather intrinsic addresses memory in bytes, hence the i8 base pointer.
    let slice = slice as *const i8;
    let r = vpgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1124
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
/// that position instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_epi32<const SCALE: i32>(
    src: __m256i,
    slice: *const i32,
    offsets: __m256i,
    mask: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    // NOTE(review): per Intel's guide, lanes whose mask MSB is SET are gathered
    // from memory and lanes with a clear MSB keep the `src` lane; the summary
    // above appears inverted — confirm against the Intrinsics Guide.
    let src = src.as_i32x8();
    let mask = mask.as_i32x8();
    let offsets = offsets.as_i32x8();
    // The gather intrinsic addresses memory in bytes, hence the i8 base pointer.
    let slice = slice as *const i8;
    let r = vpgatherdd(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1150
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    // Zero source plus an all-ones (sign bit set) mask: gather every lane.
    let zero = _mm_setzero_ps();
    let neg_one = _mm_set1_ps(-1.0);
    let offsets = offsets.as_i32x4();
    // The gather intrinsic addresses memory in bytes, hence the i8 base pointer.
    let slice = slice as *const i8;
    pgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
}
1169
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
/// that position instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_ps<const SCALE: i32>(
    src: __m128,
    slice: *const f32,
    offsets: __m128i,
    mask: __m128,
) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    // NOTE(review): per Intel's guide, lanes whose mask sign bit is SET are
    // gathered from memory and the rest keep the `src` lane; the summary above
    // appears inverted — confirm against the Intrinsics Guide.
    let offsets = offsets.as_i32x4();
    // The gather intrinsic addresses memory in bytes, hence the i8 base pointer.
    let slice = slice as *const i8;
    pgatherdps(src, slice, offsets, mask, SCALE as i8)
}
1192
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m256 {
    static_assert_imm8_scale!(SCALE);
    // Zero source plus an all-ones (sign bit set) mask: gather every lane.
    let zero = _mm256_setzero_ps();
    let neg_one = _mm256_set1_ps(-1.0);
    let offsets = offsets.as_i32x8();
    // The gather intrinsic addresses memory in bytes, hence the i8 base pointer.
    let slice = slice as *const i8;
    vpgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
}
1211
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
/// that position instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_ps<const SCALE: i32>(
    src: __m256,
    slice: *const f32,
    offsets: __m256i,
    mask: __m256,
) -> __m256 {
    static_assert_imm8_scale!(SCALE);
    // NOTE(review): per Intel's guide, lanes whose mask sign bit is SET are
    // gathered from memory and the rest keep the `src` lane; the summary above
    // appears inverted — confirm against the Intrinsics Guide.
    let offsets = offsets.as_i32x8();
    // The gather intrinsic addresses memory in bytes, hence the i8 base pointer.
    let slice = slice as *const i8;
    vpgatherdps(src, slice, offsets, mask, SCALE as i8)
}
1234
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_epi64<const SCALE: i32>(
    slice: *const i64,
    offsets: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // Zero source plus an all-ones mask: both 64-bit lanes are gathered.
    let zero = i64x2::ZERO;
    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
    // Only the low two 32-bit offsets are consumed for the two 64-bit lanes.
    let offsets = offsets.as_i32x4();
    // The gather intrinsic addresses memory in bytes, hence the i8 base pointer.
    let slice = slice as *const i8;
    let r = pgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1257
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
/// that position instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_epi64<const SCALE: i32>(
    src: __m128i,
    slice: *const i64,
    offsets: __m128i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // NOTE(review): per Intel's guide, lanes whose mask MSB is SET are gathered
    // from memory and lanes with a clear MSB keep the `src` lane; the summary
    // above appears inverted — confirm against the Intrinsics Guide.
    let src = src.as_i64x2();
    let mask = mask.as_i64x2();
    let offsets = offsets.as_i32x4();
    // The gather intrinsic addresses memory in bytes, hence the i8 base pointer.
    let slice = slice as *const i8;
    let r = pgatherdq(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1283
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_epi64<const SCALE: i32>(
    slice: *const i64,
    offsets: __m128i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    // Zero source plus an all-ones mask: every 64-bit lane is gathered.
    let zero = i64x4::ZERO;
    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
    let offsets = offsets.as_i32x4();
    // The gather intrinsic addresses memory in bytes, hence the i8 base pointer.
    let slice = slice as *const i8;
    let r = vpgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1306
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
/// that position instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_epi64<const SCALE: i32>(
    src: __m256i,
    slice: *const i64,
    offsets: __m128i,
    mask: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    // NOTE(review): per Intel's guide, lanes whose mask MSB is SET are gathered
    // from memory and lanes with a clear MSB keep the `src` lane; the summary
    // above appears inverted — confirm against the Intrinsics Guide.
    let src = src.as_i64x4();
    let mask = mask.as_i64x4();
    let offsets = offsets.as_i32x4();
    // The gather intrinsic addresses memory in bytes, hence the i8 base pointer.
    let slice = slice as *const i8;
    let r = vpgatherdq(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1332
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
    static_assert_imm8_scale!(SCALE);
    // Zero source plus an all-ones (sign bit set) mask: gather every lane.
    let zero = _mm_setzero_pd();
    let neg_one = _mm_set1_pd(-1.0);
    // Only the low two 32-bit offsets are consumed for the two f64 lanes.
    let offsets = offsets.as_i32x4();
    // The gather intrinsic addresses memory in bytes, hence the i8 base pointer.
    let slice = slice as *const i8;
    pgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
}
1351
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
/// that position instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_pd<const SCALE: i32>(
    src: __m128d,
    slice: *const f64,
    offsets: __m128i,
    mask: __m128d,
) -> __m128d {
    static_assert_imm8_scale!(SCALE);
    // NOTE(review): per Intel's guide, lanes whose mask sign bit is SET are
    // gathered from memory and the rest keep the `src` lane; the summary above
    // appears inverted — confirm against the Intrinsics Guide.
    let offsets = offsets.as_i32x4();
    // The gather intrinsic addresses memory in bytes, hence the i8 base pointer.
    let slice = slice as *const i8;
    pgatherdpd(src, slice, offsets, mask, SCALE as i8)
}
1374
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_pd<const SCALE: i32>(
    slice: *const f64,
    offsets: __m128i,
) -> __m256d {
    static_assert_imm8_scale!(SCALE);
    // Zero source plus an all-ones (sign bit set) mask: gather every lane.
    let zero = _mm256_setzero_pd();
    let neg_one = _mm256_set1_pd(-1.0);
    let offsets = offsets.as_i32x4();
    // The gather intrinsic addresses memory in bytes, hence the i8 base pointer.
    let slice = slice as *const i8;
    vpgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
}
1396
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
/// that position instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_pd<const SCALE: i32>(
    src: __m256d,
    slice: *const f64,
    offsets: __m128i,
    mask: __m256d,
) -> __m256d {
    static_assert_imm8_scale!(SCALE);
    // NOTE(review): per Intel's guide, lanes whose mask sign bit is SET are
    // gathered from memory and the rest keep the `src` lane; the summary above
    // appears inverted — confirm against the Intrinsics Guide.
    let offsets = offsets.as_i32x4();
    // The gather intrinsic addresses memory in bytes, hence the i8 base pointer.
    let slice = slice as *const i8;
    vpgatherdpd(src, slice, offsets, mask, SCALE as i8)
}
1419
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i64gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // Zero source plus an all-ones mask: both gathered lanes come from memory;
    // the upper two result lanes are zeroed by the instruction.
    let zero = i32x4::ZERO;
    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
    // 64-bit offsets index 32-bit elements.
    let offsets = offsets.as_i64x2();
    // The gather intrinsic addresses memory in bytes, hence the i8 base pointer.
    let slice = slice as *const i8;
    let r = pgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1442
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8. If mask is set, load the value from `src` in
/// that position instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i64gather_epi32<const SCALE: i32>(
    src: __m128i,
    slice: *const i32,
    offsets: __m128i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // NOTE(review): per Intel's guide, lanes whose mask MSB is SET are gathered
    // from memory and lanes with a clear MSB keep the `src` lane; the summary
    // above appears inverted — confirm against the Intrinsics Guide.
    let src = src.as_i32x4();
    let mask = mask.as_i32x4();
    // 64-bit offsets index 32-bit elements.
    let offsets = offsets.as_i64x2();
    // The gather intrinsic addresses memory in bytes, hence the i8 base pointer.
    let slice = slice as *const i8;
    let r = pgatherqd(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1468
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where
/// `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i64gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m256i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // Zero source plus an all-ones mask: every lane is gathered from memory.
    // Four 64-bit offsets produce a 128-bit result of four 32-bit lanes.
    let zero = i32x4::ZERO;
    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
    let offsets = offsets.as_i64x4();
    // The gather intrinsic addresses memory in bytes, hence the i8 base pointer.
    let slice = slice as *const i8;
    let r = vpgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1491
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For each lane, if the highest bit of
/// the corresponding `mask` element is set, the value is gathered from memory;
/// otherwise it is copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi32)
///
/// # Safety
///
/// Every address `slice + offset * SCALE` selected by an enabled mask lane
/// must be valid to read as an `i32`.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i64gather_epi32<const SCALE: i32>(
    src: __m128i,
    slice: *const i32,
    offsets: __m256i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i32x4();
    let mask = mask.as_i32x4();
    let offsets = offsets.as_i64x4();
    // Byte-granular base pointer; SCALE is applied by the gather itself.
    let slice = slice as *const i8;
    let r = vpgatherqd(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1517
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_ps)
///
/// # Safety
///
/// Every address `slice + offset * SCALE` selected by `offsets` must be valid
/// to read as an `f32`.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm_setzero_ps();
    // -1.0 has its sign bit set in every lane, which is the bit the gather
    // mask tests, so this enables all lanes.
    let neg_one = _mm_set1_ps(-1.0);
    let offsets = offsets.as_i64x2();
    let slice = slice as *const i8;
    pgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
}
1536
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For each lane, if the highest bit of
/// the corresponding `mask` element is set, the value is gathered from memory;
/// otherwise it is copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_ps)
///
/// # Safety
///
/// Every address `slice + offset * SCALE` selected by an enabled mask lane
/// must be valid to read as an `f32`.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i64gather_ps<const SCALE: i32>(
    src: __m128,
    slice: *const f32,
    offsets: __m128i,
    mask: __m128,
) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i64x2();
    // Byte-granular base pointer; SCALE is applied by the gather itself.
    let slice = slice as *const i8;
    pgatherqps(src, slice, offsets, mask, SCALE as i8)
}
1559
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_ps)
///
/// # Safety
///
/// Every address `slice + offset * SCALE` selected by `offsets` must be valid
/// to read as an `f32`.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm_setzero_ps();
    // Sign bit set in every lane => all lanes enabled in the gather mask.
    let neg_one = _mm_set1_ps(-1.0);
    let offsets = offsets.as_i64x4();
    let slice = slice as *const i8;
    vpgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
}
1578
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For each lane, if the highest bit of
/// the corresponding `mask` element is set, the value is gathered from memory;
/// otherwise it is copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_ps)
///
/// # Safety
///
/// Every address `slice + offset * SCALE` selected by an enabled mask lane
/// must be valid to read as an `f32`.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i64gather_ps<const SCALE: i32>(
    src: __m128,
    slice: *const f32,
    offsets: __m256i,
    mask: __m128,
) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i64x4();
    // Byte-granular base pointer; SCALE is applied by the gather itself.
    let slice = slice as *const i8;
    vpgatherqps(src, slice, offsets, mask, SCALE as i8)
}
1601
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi64)
///
/// # Safety
///
/// Every address `slice + offset * SCALE` selected by `offsets` must be valid
/// to read as an `i64`.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i64gather_epi64<const SCALE: i32>(
    slice: *const i64,
    offsets: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    // Merge source for masked-off lanes; unused because all lanes are enabled.
    let zero = i64x2::ZERO;
    // All-ones mask: gather every lane.
    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x2();
    let r = pgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1624
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For each lane, if the highest bit of
/// the corresponding `mask` element is set, the value is gathered from memory;
/// otherwise it is copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi64)
///
/// # Safety
///
/// Every address `slice + offset * SCALE` selected by an enabled mask lane
/// must be valid to read as an `i64`.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i64gather_epi64<const SCALE: i32>(
    src: __m128i,
    slice: *const i64,
    offsets: __m128i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i64x2();
    let mask = mask.as_i64x2();
    let offsets = offsets.as_i64x2();
    // Byte-granular base pointer; SCALE is applied by the gather itself.
    let slice = slice as *const i8;
    let r = pgatherqq(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1650
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi64)
///
/// # Safety
///
/// Every address `slice + offset * SCALE` selected by `offsets` must be valid
/// to read as an `i64`.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i64gather_epi64<const SCALE: i32>(
    slice: *const i64,
    offsets: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    // Merge source for masked-off lanes; unused because all lanes are enabled.
    let zero = i64x4::ZERO;
    // All-ones mask: gather every lane.
    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x4();
    let r = vpgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}
1673
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For each lane, if the highest bit of
/// the corresponding `mask` element is set, the value is gathered from memory;
/// otherwise it is copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi64)
///
/// # Safety
///
/// Every address `slice + offset * SCALE` selected by an enabled mask lane
/// must be valid to read as an `i64`.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i64gather_epi64<const SCALE: i32>(
    src: __m256i,
    slice: *const i64,
    offsets: __m256i,
    mask: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i64x4();
    let mask = mask.as_i64x4();
    let offsets = offsets.as_i64x4();
    // Byte-granular base pointer; SCALE is applied by the gather itself.
    let slice = slice as *const i8;
    let r = vpgatherqq(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}
1699
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd)
///
/// # Safety
///
/// Every address `slice + offset * SCALE` selected by `offsets` must be valid
/// to read as an `f64`.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i64gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm_setzero_pd();
    // Sign bit set in every lane => all lanes enabled in the gather mask.
    let neg_one = _mm_set1_pd(-1.0);
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x2();
    pgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
}
1718
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For each lane, if the highest bit of
/// the corresponding `mask` element is set, the value is gathered from memory;
/// otherwise it is copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_pd)
///
/// # Safety
///
/// Every address `slice + offset * SCALE` selected by an enabled mask lane
/// must be valid to read as an `f64`.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i64gather_pd<const SCALE: i32>(
    src: __m128d,
    slice: *const f64,
    offsets: __m128i,
    mask: __m128d,
) -> __m128d {
    static_assert_imm8_scale!(SCALE);
    // Byte-granular base pointer; SCALE is applied by the gather itself.
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x2();
    pgatherqpd(src, slice, offsets, mask, SCALE as i8)
}
1741
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_pd)
///
/// # Safety
///
/// Every address `slice + offset * SCALE` selected by `offsets` must be valid
/// to read as an `f64`.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i64gather_pd<const SCALE: i32>(
    slice: *const f64,
    offsets: __m256i,
) -> __m256d {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm256_setzero_pd();
    // Sign bit set in every lane => all lanes enabled in the gather mask.
    let neg_one = _mm256_set1_pd(-1.0);
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x4();
    vpgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
}
1763
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For each lane, if the highest bit of
/// the corresponding `mask` element is set, the value is gathered from memory;
/// otherwise it is copied from `src`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_pd)
///
/// # Safety
///
/// Every address `slice + offset * SCALE` selected by an enabled mask lane
/// must be valid to read as an `f64`.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i64gather_pd<const SCALE: i32>(
    src: __m256d,
    slice: *const f64,
    offsets: __m256i,
    mask: __m256d,
) -> __m256d {
    static_assert_imm8_scale!(SCALE);
    // Byte-granular base pointer; SCALE is applied by the gather itself.
    let slice = slice as *const i8;
    let offsets = offsets.as_i64x4();
    vpgatherqpd(src, slice, offsets, mask, SCALE as i8)
}
1786
/// Copies `a` to `dst`, then insert 128 bits (of integer data) from `b` at the
/// location specified by `IMM1` (0 = low half, 1 = high half).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
    // IMM1 must be 0 or 1 (which 128-bit half of `a` gets replaced).
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        let a = a.as_i64x4();
        // Widen `b` to 256 bits so both shuffle operands have the same lane
        // count. Only shuffle indices 4 and 5 (i.e. b's low 128 bits) are
        // ever selected, so the unspecified upper half of the cast is never
        // observed.
        let b = _mm256_castsi128_si256(b).as_i64x4();
        // Indices 0..4 pick lanes of `a`, 4..8 pick lanes of `b`:
        // IMM1 == 0 replaces a's low half, IMM1 == 1 replaces a's high half.
        let dst: i64x4 = simd_shuffle!(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
        transmute(dst)
    }
}
1806
/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Horizontally add adjacent pairs
/// of intermediate 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
    // It's a trick used in the Adler-32 algorithm to perform a widening addition.
    //
    // ```rust
    // #[target_feature(enable = "avx2")]
    // unsafe fn widening_add(mad: __m256i) -> __m256i {
    //     _mm256_madd_epi16(mad, _mm256_set1_epi16(1))
    // }
    // ```
    //
    // If we implement this using generic vector intrinsics, the optimizer
    // will eliminate this pattern, and `vpmaddwd` will no longer be emitted.
    // For this reason, we use x86 intrinsics.
    //
    // NOTE: do not "simplify" this into a generic-SIMD formulation; the
    // direct call to the `pmaddwd` LLVM intrinsic is intentional (see above).
    unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
}
1831
/// Vertically multiplies each unsigned 8-bit integer from `a` with the
/// corresponding signed 8-bit integer from `b`, producing intermediate
/// signed 16-bit integers. Horizontally add adjacent pairs of intermediate
/// signed 16-bit integers (with signed saturation, per the `vpmaddubsw`
/// instruction semantics).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaddubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
    // Calls the LLVM intrinsic directly: `a` is reinterpreted as unsigned
    // bytes, `b` as signed bytes, matching the instruction's operand roles.
    unsafe { transmute(pmaddubsw(a.as_u8x32(), b.as_i8x32())) }
}
1845
/// Loads packed 32-bit integers from memory pointed by `mem_addr` using `mask`
/// (elements are zeroed out when the highest bit is not set in the
/// corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi32)
///
/// # Safety
///
/// `mem_addr` needs no particular alignment, but the elements whose mask lane
/// has its highest bit set must be valid to read.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
    // Arithmetic right shift broadcasts each lane's sign (mask) bit across
    // the whole lane: all-ones where the high bit was set, zero otherwise.
    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
    // Disabled lanes are not read from memory and take the passthrough (0).
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i32x4::ZERO).as_m128i()
}
1860
/// Loads packed 32-bit integers from memory pointed by `mem_addr` using `mask`
/// (elements are zeroed out when the highest bit is not set in the
/// corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi32)
///
/// # Safety
///
/// `mem_addr` needs no particular alignment, but the elements whose mask lane
/// has its highest bit set must be valid to read.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
    // Broadcast each lane's sign bit across the lane to form a full-width mask.
    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
    // Disabled lanes are not read and take the passthrough (0).
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i32x8::ZERO).as_m256i()
}
1875
/// Loads packed 64-bit integers from memory pointed by `mem_addr` using `mask`
/// (elements are zeroed out when the highest bit is not set in the
/// corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi64)
///
/// # Safety
///
/// `mem_addr` needs no particular alignment, but the elements whose mask lane
/// has its highest bit set must be valid to read.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
    // Broadcast each lane's sign bit (bit 63) across the lane.
    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
    // Disabled lanes are not read and take the passthrough (0).
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i64x2::ZERO).as_m128i()
}
1890
/// Loads packed 64-bit integers from memory pointed by `mem_addr` using `mask`
/// (elements are zeroed out when the highest bit is not set in the
/// corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi64)
///
/// # Safety
///
/// `mem_addr` needs no particular alignment, but the elements whose mask lane
/// has its highest bit set must be valid to read.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
    // Broadcast each lane's sign bit (bit 63) across the lane.
    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
    // Disabled lanes are not read and take the passthrough (0).
    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i64x4::ZERO).as_m256i()
}
1905
/// Stores packed 32-bit integers from `a` into memory pointed by `mem_addr`
/// using `mask` (elements are not stored when the highest bit is not set
/// in the corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi32)
///
/// # Safety
///
/// `mem_addr` needs no particular alignment, but the elements whose mask lane
/// has its highest bit set must be valid to write.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
    // Broadcast each lane's sign bit across the lane to form a full-width mask.
    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
    // Disabled lanes leave the corresponding memory untouched.
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x4())
}
1920
/// Stores packed 32-bit integers from `a` into memory pointed by `mem_addr`
/// using `mask` (elements are not stored when the highest bit is not set
/// in the corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi32)
///
/// # Safety
///
/// `mem_addr` needs no particular alignment, but the elements whose mask lane
/// has its highest bit set must be valid to write.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
    // Broadcast each lane's sign bit across the lane to form a full-width mask.
    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
    // Disabled lanes leave the corresponding memory untouched.
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x8())
}
1935
/// Stores packed 64-bit integers from `a` into memory pointed by `mem_addr`
/// using `mask` (elements are not stored when the highest bit is not set
/// in the corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi64)
///
/// # Safety
///
/// `mem_addr` needs no particular alignment, but the elements whose mask lane
/// has its highest bit set must be valid to write.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
    // Broadcast each lane's sign bit (bit 63) across the lane.
    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
    // Disabled lanes leave the corresponding memory untouched.
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x2())
}
1950
/// Stores packed 64-bit integers from `a` into memory pointed by `mem_addr`
/// using `mask` (elements are not stored when the highest bit is not set
/// in the corresponding element).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi64)
///
/// # Safety
///
/// `mem_addr` needs no particular alignment, but the elements whose mask lane
/// has its highest bit set must be valid to write.
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaskmovq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
    // Broadcast each lane's sign bit (bit 63) across the lane.
    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
    // Disabled lanes leave the corresponding memory untouched.
    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x4())
}
1965
1966/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
1967/// maximum values.
1968///
1969/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16)
1970#[inline]
1971#[target_feature(enable = "avx2")]
1972#[cfg_attr(test, assert_instr(vpmaxsw))]
1973#[stable(feature = "simd_x86", since = "1.27.0")]
1974#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1975pub const fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
1976    unsafe { simd_imax(a.as_i16x16(), b.as_i16x16()).as_m256i() }
1977}
1978
1979/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
1980/// maximum values.
1981///
1982/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32)
1983#[inline]
1984#[target_feature(enable = "avx2")]
1985#[cfg_attr(test, assert_instr(vpmaxsd))]
1986#[stable(feature = "simd_x86", since = "1.27.0")]
1987#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1988pub const fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
1989    unsafe { simd_imax(a.as_i32x8(), b.as_i32x8()).as_m256i() }
1990}
1991
1992/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
1993/// maximum values.
1994///
1995/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8)
1996#[inline]
1997#[target_feature(enable = "avx2")]
1998#[cfg_attr(test, assert_instr(vpmaxsb))]
1999#[stable(feature = "simd_x86", since = "1.27.0")]
2000#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2001pub const fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
2002    unsafe { simd_imax(a.as_i8x32(), b.as_i8x32()).as_m256i() }
2003}
2004
2005/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2006/// the packed maximum values.
2007///
2008/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16)
2009#[inline]
2010#[target_feature(enable = "avx2")]
2011#[cfg_attr(test, assert_instr(vpmaxuw))]
2012#[stable(feature = "simd_x86", since = "1.27.0")]
2013#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2014pub const fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
2015    unsafe { simd_imax(a.as_u16x16(), b.as_u16x16()).as_m256i() }
2016}
2017
2018/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2019/// the packed maximum values.
2020///
2021/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32)
2022#[inline]
2023#[target_feature(enable = "avx2")]
2024#[cfg_attr(test, assert_instr(vpmaxud))]
2025#[stable(feature = "simd_x86", since = "1.27.0")]
2026#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2027pub const fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
2028    unsafe { simd_imax(a.as_u32x8(), b.as_u32x8()).as_m256i() }
2029}
2030
2031/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2032/// the packed maximum values.
2033///
2034/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8)
2035#[inline]
2036#[target_feature(enable = "avx2")]
2037#[cfg_attr(test, assert_instr(vpmaxub))]
2038#[stable(feature = "simd_x86", since = "1.27.0")]
2039#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2040pub const fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
2041    unsafe { simd_imax(a.as_u8x32(), b.as_u8x32()).as_m256i() }
2042}
2043
2044/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
2045/// minimum values.
2046///
2047/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16)
2048#[inline]
2049#[target_feature(enable = "avx2")]
2050#[cfg_attr(test, assert_instr(vpminsw))]
2051#[stable(feature = "simd_x86", since = "1.27.0")]
2052#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2053pub const fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
2054    unsafe { simd_imin(a.as_i16x16(), b.as_i16x16()).as_m256i() }
2055}
2056
2057/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2058/// minimum values.
2059///
2060/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32)
2061#[inline]
2062#[target_feature(enable = "avx2")]
2063#[cfg_attr(test, assert_instr(vpminsd))]
2064#[stable(feature = "simd_x86", since = "1.27.0")]
2065#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2066pub const fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
2067    unsafe { simd_imin(a.as_i32x8(), b.as_i32x8()).as_m256i() }
2068}
2069
2070/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2071/// minimum values.
2072///
2073/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8)
2074#[inline]
2075#[target_feature(enable = "avx2")]
2076#[cfg_attr(test, assert_instr(vpminsb))]
2077#[stable(feature = "simd_x86", since = "1.27.0")]
2078#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2079pub const fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
2080    unsafe { simd_imin(a.as_i8x32(), b.as_i8x32()).as_m256i() }
2081}
2082
2083/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2084/// the packed minimum values.
2085///
2086/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16)
2087#[inline]
2088#[target_feature(enable = "avx2")]
2089#[cfg_attr(test, assert_instr(vpminuw))]
2090#[stable(feature = "simd_x86", since = "1.27.0")]
2091#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2092pub const fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
2093    unsafe { simd_imin(a.as_u16x16(), b.as_u16x16()).as_m256i() }
2094}
2095
2096/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2097/// the packed minimum values.
2098///
2099/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32)
2100#[inline]
2101#[target_feature(enable = "avx2")]
2102#[cfg_attr(test, assert_instr(vpminud))]
2103#[stable(feature = "simd_x86", since = "1.27.0")]
2104#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2105pub const fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
2106    unsafe { simd_imin(a.as_u32x8(), b.as_u32x8()).as_m256i() }
2107}
2108
2109/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2110/// the packed minimum values.
2111///
2112/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8)
2113#[inline]
2114#[target_feature(enable = "avx2")]
2115#[cfg_attr(test, assert_instr(vpminub))]
2116#[stable(feature = "simd_x86", since = "1.27.0")]
2117#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2118pub const fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
2119    unsafe { simd_imin(a.as_u8x32(), b.as_u8x32()).as_m256i() }
2120}
2121
2122/// Creates mask from the most significant bit of each 8-bit element in `a`,
2123/// return the result.
2124///
2125/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8)
2126#[inline]
2127#[target_feature(enable = "avx2")]
2128#[cfg_attr(test, assert_instr(vpmovmskb))]
2129#[stable(feature = "simd_x86", since = "1.27.0")]
2130#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2131pub const fn _mm256_movemask_epi8(a: __m256i) -> i32 {
2132    unsafe {
2133        let z = i8x32::ZERO;
2134        let m: i8x32 = simd_lt(a.as_i8x32(), z);
2135        simd_bitmask::<_, u32>(m) as i32
2136    }
2137}
2138
/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
/// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
/// results in dst. Eight SADs are performed for each 128-bit lane using one
/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
/// selected from `b` starting at on the offset specified in `imm8`. Eight
/// quadruplets are formed from sequential 8-bit integers selected from `a`
/// starting at the offset specified in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vmpsadbw, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    // IMM8 must fit in 8 bits; the quadruplet offsets are encoded in its
    // low bit fields (see the Intel documentation linked above).
    static_assert_uimm_bits!(IMM8, 8);
    // Calls the LLVM intrinsic directly; no generic-SIMD pattern maps to
    // `vmpsadbw`.
    unsafe { transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8 as i8)) }
}
2157
/// Multiplies the low 32-bit integers from each packed 64-bit element in
/// `a` and `b`
///
/// Returns the 64-bit results.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // The inner cast truncates each 64-bit lane to its low 32 bits; the
        // outer cast sign-extends back to 64 bits. The full 64-bit product of
        // two sign-extended 32-bit values is exact, and LLVM pattern-matches
        // this extend-then-multiply shape to a single `vpmuldq`.
        let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4()));
        let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4()));
        transmute(simd_mul(a, b))
    }
}
2176
2177/// Multiplies the low unsigned 32-bit integers from each packed 64-bit
2178/// element in `a` and `b`
2179///
2180/// Returns the unsigned 64-bit results.
2181///
2182/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmuludq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let a = a.as_u64x4();
        let b = b.as_u64x4();
        // Masking keeps only the zero-extended low 32 bits of each 64-bit
        // lane; a 32x32-bit product always fits in 64 bits, so the multiply
        // below cannot wrap.
        let mask = u64x4::splat(u32::MAX as u64);
        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
    }
}
2196
2197/// Multiplies the packed 16-bit integers in `a` and `b`, producing
2198/// intermediate 32-bit integers and returning the high 16 bits of the
2199/// intermediate integers.
2200///
2201/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmulhw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Widen to 32 bits so the full signed product is available, then an
        // arithmetic shift right by 16 extracts the high half of each
        // product before truncating back to 16 bits.
        let a = simd_cast::<_, i32x16>(a.as_i16x16());
        let b = simd_cast::<_, i32x16>(b.as_i16x16());
        let r = simd_shr(simd_mul(a, b), i32x16::splat(16));
        transmute(simd_cast::<i32x16, i16x16>(r))
    }
}
2215
2216/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
2217/// intermediate 32-bit integers and returning the high 16 bits of the
2218/// intermediate integers.
2219///
2220/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmulhuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Same scheme as `_mm256_mulhi_epi16` but on unsigned lanes, so the
        // shift right by 16 is a logical shift (u32 element type).
        let a = simd_cast::<_, u32x16>(a.as_u16x16());
        let b = simd_cast::<_, u32x16>(b.as_u16x16());
        let r = simd_shr(simd_mul(a, b), u32x16::splat(16));
        transmute(simd_cast::<u32x16, u16x16>(r))
    }
}
2234
2235/// Multiplies the packed 16-bit integers in `a` and `b`, producing
2236/// intermediate 32-bit integers, and returns the low 16 bits of the
2237/// intermediate integers
2238///
2239/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmullw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
    // Lane-wise wrapping multiply: `simd_mul` keeps only the low 16 bits of
    // each product, which is exactly what `vpmullw` computes.
    unsafe { transmute(simd_mul(a.as_i16x16(), b.as_i16x16())) }
}
2248
2249/// Multiplies the packed 32-bit integers in `a` and `b`, producing
2250/// intermediate 64-bit integers, and returns the low 32 bits of the
2251/// intermediate integers
2252///
2253/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
    // Lane-wise wrapping multiply: only the low 32 bits of each product are
    // kept, matching `vpmulld`.
    unsafe { transmute(simd_mul(a.as_i32x8(), b.as_i32x8())) }
}
2262
2263/// Multiplies packed 16-bit integers in `a` and `b`, producing
2264/// intermediate signed 32-bit integers. Truncate each intermediate
2265/// integer to the 18 most significant bits, round by adding 1, and
2266/// return bits `[16:1]`.
2267///
2268/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmulhrsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: AVX2 is guaranteed by `target_feature`; `pmulhrsw` is the
    // LLVM-intrinsic wrapper declared elsewhere in this file.
    unsafe { transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) }
}
2276
2277/// Computes the bitwise OR of 256 bits (representing integer data) in `a`
2278/// and `b`
2279///
2280/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
    // A bitwise OR is lane-width agnostic; the i32x8 view is an arbitrary
    // but convenient choice.
    unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) }
}
2289
2290/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2291/// using signed saturation
2292///
2293/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpacksswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: AVX2 is guaranteed by `target_feature`; `packsswb` is the
    // LLVM-intrinsic wrapper declared elsewhere in this file.
    unsafe { transmute(packsswb(a.as_i16x16(), b.as_i16x16())) }
}
2301
2302/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2303/// using signed saturation
2304///
2305/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpackssdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: AVX2 is guaranteed by `target_feature`; `packssdw` is the
    // LLVM-intrinsic wrapper declared elsewhere in this file.
    unsafe { transmute(packssdw(a.as_i32x8(), b.as_i32x8())) }
}
2313
2314/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2315/// using unsigned saturation
2316///
2317/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpackuswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: AVX2 is guaranteed by `target_feature`; `packuswb` is the
    // LLVM-intrinsic wrapper declared elsewhere in this file.
    unsafe { transmute(packuswb(a.as_i16x16(), b.as_i16x16())) }
}
2325
2326/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2327/// using unsigned saturation
2328///
2329/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpackusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: AVX2 is guaranteed by `target_feature`; `packusdw` is the
    // LLVM-intrinsic wrapper declared elsewhere in this file.
    unsafe { transmute(packusdw(a.as_i32x8(), b.as_i32x8())) }
}
2337
2338/// Permutes packed 32-bit integers from `a` according to the content of `b`.
2339///
2340/// The last 3 bits of each integer of `b` are used as addresses into the 8
2341/// integers of `a`.
2342///
2343/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpermps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: AVX2 is guaranteed by `target_feature`; `permd` is the
    // LLVM-intrinsic wrapper declared elsewhere in this file.
    unsafe { transmute(permd(a.as_u32x8(), b.as_u32x8())) }
}
2351
2352/// Permutes 64-bit integers from `a` using control mask `imm8`.
2353///
2354/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // The second shuffle operand is never selected (all indices are in
        // 0..=3); a zero vector merely satisfies `simd_shuffle!`'s
        // two-input shape.
        let zero = i64x4::ZERO;
        // Each 2-bit field of `IMM8` picks one of `a`'s four 64-bit lanes.
        let r: i64x4 = simd_shuffle!(
            a.as_i64x4(),
            zero,
            [
                IMM8 as u32 & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        );
        transmute(r)
    }
}
2378
2379/// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`.
2380///
2381/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // The AVX2 integer form shares its semantics with the AVX
    // floating-point variant, so simply forward to it.
    _mm256_permute2f128_si256::<IMM8>(a, b)
}
2392
2393/// Shuffles 64-bit floating-point elements in `a` across lanes using the
2394/// control in `imm8`.
2395///
2396/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_permute4x64_pd<const IMM8: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // All indices are in 0..=3, so only `a` is ever read; the undefined
        // second operand just satisfies `simd_shuffle!`'s two-input shape.
        // Each 2-bit field of `IMM8` selects one of `a`'s four f64 lanes.
        simd_shuffle!(
            a,
            _mm256_undefined_pd(),
            [
                IMM8 as u32 & 0b11,
                (IMM8 as u32 >> 2) & 0b11,
                (IMM8 as u32 >> 4) & 0b11,
                (IMM8 as u32 >> 6) & 0b11,
            ],
        )
    }
}
2418
2419/// Shuffles eight 32-bit floating-point elements in `a` across lanes using
2420/// the corresponding 32-bit integer index in `idx`.
2421///
2422/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpermps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
    // SAFETY: AVX2 is guaranteed by `target_feature`; `permps` is the
    // LLVM-intrinsic wrapper declared elsewhere in this file.
    unsafe { permps(a, idx.as_i32x8()) }
}
2430
2431/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
2432/// and `b`, then horizontally sum each consecutive 8 differences to
2433/// produce four unsigned 16-bit integers, and pack these unsigned 16-bit
/// integers in the low 16 bits of the 64-bit elements of the return value
2435///
2436/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsadbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: AVX2 is guaranteed by `target_feature`; `psadbw` is the
    // LLVM-intrinsic wrapper declared elsewhere in this file.
    unsafe { transmute(psadbw(a.as_u8x32(), b.as_u8x32())) }
}
2444
2445/// Shuffles bytes from `a` according to the content of `b`.
2446///
2447/// For each of the 128-bit low and high halves of the vectors, the last
2448/// 4 bits of each byte of `b` are used as addresses into the respective
2449/// low or high 16 bytes of `a`. That is, the halves are shuffled separately.
2450///
2451/// In addition, if the highest significant bit of a byte of `b` is set, the
2452/// respective destination byte is set to 0.
2453///
2454/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
2455/// equivalent to:
2456///
2457/// ```
2458/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
2459///     let mut r = [0; 32];
2460///     for i in 0..16 {
2461///         // if the most significant bit of b is set,
2462///         // then the destination byte is set to 0.
2463///         if b[i] & 0x80 == 0u8 {
2464///             r[i] = a[(b[i] % 16) as usize];
2465///         }
2466///         if b[i + 16] & 0x80 == 0u8 {
2467///             r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
2468///         }
2469///     }
2470///     r
2471/// }
2472/// ```
2473///
2474/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpshufb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: AVX2 is guaranteed by `target_feature`; `pshufb` is the
    // LLVM-intrinsic wrapper declared elsewhere in this file.
    unsafe { transmute(pshufb(a.as_u8x32(), b.as_u8x32())) }
}
2482
2483/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
2484/// `imm8`.
2485///
2486/// ```rust
2487/// #[cfg(target_arch = "x86")]
2488/// use std::arch::x86::*;
2489/// #[cfg(target_arch = "x86_64")]
2490/// use std::arch::x86_64::*;
2491///
2492/// # fn main() {
2493/// #     if is_x86_feature_detected!("avx2") {
2494/// #         #[target_feature(enable = "avx2")]
2495/// #         unsafe fn worker() {
2496/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2497///
2498/// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
2499/// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
2500///
2501/// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
2502/// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
2503///
2504/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
2505/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
2506/// #         }
2507/// #         unsafe { worker(); }
2508/// #     }
2509/// # }
2510/// ```
2511///
2512/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(MASK, 8);
    unsafe {
        // The same four 2-bit selectors of `MASK` are applied to both
        // 128-bit lanes; the `+ 4` offsets address the high lane's elements.
        let r: i32x8 = simd_shuffle!(
            a.as_i32x8(),
            a.as_i32x8(),
            [
                MASK as u32 & 0b11,
                (MASK as u32 >> 2) & 0b11,
                (MASK as u32 >> 4) & 0b11,
                (MASK as u32 >> 6) & 0b11,
                (MASK as u32 & 0b11) + 4,
                ((MASK as u32 >> 2) & 0b11) + 4,
                ((MASK as u32 >> 4) & 0b11) + 4,
                ((MASK as u32 >> 6) & 0b11) + 4,
            ],
        );
        transmute(r)
    }
}
2539
2540/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
2541/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
2542/// to the output.
2543///
2544/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x16();
        // Elements 0..=3 and 8..=11 (the low 64 bits of each 128-bit lane)
        // are copied through; each 2-bit field of `IMM8` selects one of the
        // four high elements (base 4 for the low lane, 12 for the high).
        let r: i16x16 = simd_shuffle!(
            a,
            a,
            [
                0,
                1,
                2,
                3,
                4 + (IMM8 as u32 & 0b11),
                4 + ((IMM8 as u32 >> 2) & 0b11),
                4 + ((IMM8 as u32 >> 4) & 0b11),
                4 + ((IMM8 as u32 >> 6) & 0b11),
                8,
                9,
                10,
                11,
                12 + (IMM8 as u32 & 0b11),
                12 + ((IMM8 as u32 >> 2) & 0b11),
                12 + ((IMM8 as u32 >> 4) & 0b11),
                12 + ((IMM8 as u32 >> 6) & 0b11),
            ],
        );
        transmute(r)
    }
}
2580
2581/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
2582/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
2583/// to the output.
2584///
2585/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16)
2586#[inline]
2587#[target_feature(enable = "avx2")]
2588#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 9))]
2589#[rustc_legacy_const_generics(1)]
2590#[stable(feature = "simd_x86", since = "1.27.0")]
2591#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2592pub const fn _mm256_shufflelo_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2593    static_assert_uimm_bits!(IMM8, 8);
2594    unsafe {
2595        let a = a.as_i16x16();
2596        let r: i16x16 = simd_shuffle!(
2597            a,
2598            a,
2599            [
2600                0 + (IMM8 as u32 & 0b11),
2601                0 + ((IMM8 as u32 >> 2) & 0b11),
2602                0 + ((IMM8 as u32 >> 4) & 0b11),
2603                0 + ((IMM8 as u32 >> 6) & 0b11),
2604                4,
2605                5,
2606                6,
2607                7,
2608                8 + (IMM8 as u32 & 0b11),
2609                8 + ((IMM8 as u32 >> 2) & 0b11),
2610                8 + ((IMM8 as u32 >> 4) & 0b11),
2611                8 + ((IMM8 as u32 >> 6) & 0b11),
2612                12,
2613                13,
2614                14,
2615                15,
2616            ],
2617        );
2618        transmute(r)
2619    }
2620}
2621
2622/// Negates packed 16-bit integers in `a` when the corresponding signed
2623/// 16-bit integer in `b` is negative, and returns the results.
2624/// Results are zeroed out when the corresponding element in `b` is zero.
2625///
2626/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsignw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: AVX2 is guaranteed by `target_feature`; `psignw` is the
    // LLVM-intrinsic wrapper declared elsewhere in this file.
    unsafe { transmute(psignw(a.as_i16x16(), b.as_i16x16())) }
}
2634
2635/// Negates packed 32-bit integers in `a` when the corresponding signed
2636/// 32-bit integer in `b` is negative, and returns the results.
2637/// Results are zeroed out when the corresponding element in `b` is zero.
2638///
2639/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsignd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: AVX2 is guaranteed by `target_feature`; `psignd` is the
    // LLVM-intrinsic wrapper declared elsewhere in this file.
    unsafe { transmute(psignd(a.as_i32x8(), b.as_i32x8())) }
}
2647
2648/// Negates packed 8-bit integers in `a` when the corresponding signed
2649/// 8-bit integer in `b` is negative, and returns the results.
2650/// Results are zeroed out when the corresponding element in `b` is zero.
2651///
2652/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsignb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
    // SAFETY: AVX2 is guaranteed by `target_feature`; `psignb` is the
    // LLVM-intrinsic wrapper declared elsewhere in this file.
    unsafe { transmute(psignb(a.as_i8x32(), b.as_i8x32())) }
}
2660
2661/// Shifts packed 16-bit integers in `a` left by `count` while
2662/// shifting in zeros, and returns the result
2663///
2664/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
    // SAFETY: AVX2 is guaranteed by `target_feature`; `psllw` is the
    // LLVM-intrinsic wrapper declared elsewhere in this file.
    unsafe { transmute(psllw(a.as_i16x16(), count.as_i16x8())) }
}
2672
2673/// Shifts packed 32-bit integers in `a` left by `count` while
2674/// shifting in zeros, and returns the result
2675///
2676/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpslld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
    // SAFETY: AVX2 is guaranteed by `target_feature`; `pslld` is the
    // LLVM-intrinsic wrapper declared elsewhere in this file.
    unsafe { transmute(pslld(a.as_i32x8(), count.as_i32x4())) }
}
2684
2685/// Shifts packed 64-bit integers in `a` left by `count` while
2686/// shifting in zeros, and returns the result
2687///
2688/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
    // SAFETY: AVX2 is guaranteed by `target_feature`; `psllq` is the
    // LLVM-intrinsic wrapper declared elsewhere in this file.
    unsafe { transmute(psllq(a.as_i64x4(), count.as_i64x2())) }
}
2696
2697/// Shifts packed 16-bit integers in `a` left by `IMM8` while
/// shifting in zeros, and returns the results.
2699///
2700/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_slli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        // Shift counts >= 16 must yield zero (hardware semantics); they are
        // handled explicitly because `simd_shl` does not permit shifts of
        // the full lane width or more.
        if IMM8 >= 16 {
            _mm256_setzero_si256()
        } else {
            transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
        }
    }
}
2717
2718/// Shifts packed 32-bit integers in `a` left by `IMM8` while
/// shifting in zeros, and returns the results.
2720///
2721/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32)
2722#[inline]
2723#[target_feature(enable = "avx2")]
2724#[cfg_attr(test, assert_instr(vpslld, IMM8 = 7))]
2725#[rustc_legacy_const_generics(1)]
2726#[stable(feature = "simd_x86", since = "1.27.0")]
2727#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2728pub const fn _mm256_slli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
2729    unsafe {
2730        static_assert_uimm_bits!(IMM8, 8);
2731        if IMM8 >= 32 {
2732            _mm256_setzero_si256()
2733        } else {
2734            transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
2735        }
2736    }
2737}
2738
2739/// Shifts packed 64-bit integers in `a` left by `IMM8` while
/// shifting in zeros, and returns the results.
2741///
2742/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64)
2743#[inline]
2744#[target_feature(enable = "avx2")]
2745#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 7))]
2746#[rustc_legacy_const_generics(1)]
2747#[stable(feature = "simd_x86", since = "1.27.0")]
2748#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2749pub const fn _mm256_slli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
2750    unsafe {
2751        static_assert_uimm_bits!(IMM8, 8);
2752        if IMM8 >= 64 {
2753            _mm256_setzero_si256()
2754        } else {
2755            transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
2756        }
2757    }
2758}
2759
2760/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2761///
2762/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Alias of `_mm256_bslli_epi128`: the two intrinsics are identical.
    _mm256_bslli_epi128::<IMM8>(a)
}
2773
2774/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2775///
2776/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Computes the shuffle index for output byte `i`. In the shuffle below,
    // indices 0..=31 select from the zero vector (first operand) and
    // 32..=63 select from `a` (second operand). Bytes shifted in from below
    // — and the whole result when the shift exceeds 15 — become zero.
    // `i % 16` keeps the shift confined to each 128-bit lane.
    const fn mask(shift: i32, i: u32) -> u32 {
        let shift = shift as u32 & 0xff;
        if shift > 15 || i % 16 < shift {
            0
        } else {
            32 + (i - shift)
        }
    }
    unsafe {
        let a = a.as_i8x32();
        let r: i8x32 = simd_shuffle!(
            i8x32::ZERO,
            a,
            [
                mask(IMM8, 0),
                mask(IMM8, 1),
                mask(IMM8, 2),
                mask(IMM8, 3),
                mask(IMM8, 4),
                mask(IMM8, 5),
                mask(IMM8, 6),
                mask(IMM8, 7),
                mask(IMM8, 8),
                mask(IMM8, 9),
                mask(IMM8, 10),
                mask(IMM8, 11),
                mask(IMM8, 12),
                mask(IMM8, 13),
                mask(IMM8, 14),
                mask(IMM8, 15),
                mask(IMM8, 16),
                mask(IMM8, 17),
                mask(IMM8, 18),
                mask(IMM8, 19),
                mask(IMM8, 20),
                mask(IMM8, 21),
                mask(IMM8, 22),
                mask(IMM8, 23),
                mask(IMM8, 24),
                mask(IMM8, 25),
                mask(IMM8, 26),
                mask(IMM8, 27),
                mask(IMM8, 28),
                mask(IMM8, 29),
                mask(IMM8, 30),
                mask(IMM8, 31),
            ],
        );
        transmute(r)
    }
}
2836
2837/// Shifts packed 32-bit integers in `a` left by the amount
2838/// specified by the corresponding element in `count` while
2839/// shifting in zeros, and returns the result.
2840///
2841/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllvd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe {
        let count = count.as_u32x4();
        // Per-lane counts >= 32 must produce 0 (hardware semantics), but
        // `simd_shl` does not allow such counts: clamp them to 0 for the
        // shift, then zero those lanes in the final select.
        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
        let count = simd_select(no_overflow, count, u32x4::ZERO);
        simd_select(no_overflow, simd_shl(a.as_u32x4(), count), u32x4::ZERO).as_m128i()
    }
}
2855
2856/// Shifts packed 32-bit integers in `a` left by the amount
2857/// specified by the corresponding element in `count` while
2858/// shifting in zeros, and returns the result.
2859///
2860/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllvd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
    unsafe {
        let count = count.as_u32x8();
        // Per-lane counts >= 32 must produce 0 (hardware semantics), but
        // `simd_shl` does not allow such counts: clamp them to 0 for the
        // shift, then zero those lanes in the final select.
        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
        let count = simd_select(no_overflow, count, u32x8::ZERO);
        simd_select(no_overflow, simd_shl(a.as_u32x8(), count), u32x8::ZERO).as_m256i()
    }
}
2874
2875/// Shifts packed 64-bit integers in `a` left by the amount
2876/// specified by the corresponding element in `count` while
2877/// shifting in zeros, and returns the result.
2878///
2879/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllvq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
    unsafe {
        let count = count.as_u64x2();
        // Per-lane counts >= 64 must produce 0 (hardware semantics), but
        // `simd_shl` does not allow such counts: clamp them to 0 for the
        // shift, then zero those lanes in the final select.
        let no_overflow: u64x2 = simd_lt(count, u64x2::splat(u64::BITS as u64));
        let count = simd_select(no_overflow, count, u64x2::ZERO);
        simd_select(no_overflow, simd_shl(a.as_u64x2(), count), u64x2::ZERO).as_m128i()
    }
}
2893
2894/// Shifts packed 64-bit integers in `a` left by the amount
2895/// specified by the corresponding element in `count` while
2896/// shifting in zeros, and returns the result.
2897///
2898/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsllvq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
    unsafe {
        let count = count.as_u64x4();
        // Per-lane counts >= 64 must produce 0 (hardware semantics), but
        // `simd_shl` does not allow such counts: clamp them to 0 for the
        // shift, then zero those lanes in the final select.
        let no_overflow: u64x4 = simd_lt(count, u64x4::splat(u64::BITS as u64));
        let count = simd_select(no_overflow, count, u64x4::ZERO);
        simd_select(no_overflow, simd_shl(a.as_u64x4(), count), u64x4::ZERO).as_m256i()
    }
}
2912
2913/// Shifts packed 16-bit integers in `a` right by `count` while
2914/// shifting in sign bits.
2915///
2916/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsraw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
    // SAFETY: AVX2 is guaranteed by `target_feature`; `psraw` is the
    // LLVM-intrinsic wrapper declared elsewhere in this file.
    unsafe { transmute(psraw(a.as_i16x16(), count.as_i16x8())) }
}
2924
2925/// Shifts packed 32-bit integers in `a` right by `count` while
2926/// shifting in sign bits.
2927///
2928/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32)
2929#[inline]
2930#[target_feature(enable = "avx2")]
2931#[cfg_attr(test, assert_instr(vpsrad))]
2932#[stable(feature = "simd_x86", since = "1.27.0")]
2933pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
2934    unsafe { transmute(psrad(a.as_i32x8(), count.as_i32x4())) }
2935}
2936
2937/// Shifts packed 16-bit integers in `a` right by `IMM8` while
2938/// shifting in sign bits.
2939///
2940/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_srai_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // An arithmetic shift by >= 16 fills every bit with the sign bit, which
    // equals a shift by 15; clamping also keeps `simd_shr` defined, since
    // shifting by the full lane width would be UB.
    unsafe { transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) }
}
2951
2952/// Shifts packed 32-bit integers in `a` right by `IMM8` while
2953/// shifting in sign bits.
2954///
2955/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 7))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_srai_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // An arithmetic shift by >= 32 fills every bit with the sign bit, which
    // equals a shift by 31; clamping also keeps `simd_shr` defined, since
    // shifting by the full lane width would be UB.
    unsafe { transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) }
}
2966
2967/// Shifts packed 32-bit integers in `a` right by the amount specified by the
2968/// corresponding element in `count` while shifting in sign bits.
2969///
2970/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsravd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe {
        let count = count.as_u32x4();
        // Lanes with a count >= 32 must come out as all sign bits, i.e. the
        // same as an arithmetic shift by 31. Substituting 31 for those lanes
        // also keeps `simd_shr` defined (full-width shifts are UB).
        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
        let count = simd_select(no_overflow, transmute(count), i32x4::splat(31));
        simd_shr(a.as_i32x4(), count).as_m128i()
    }
}
2984
2985/// Shifts packed 32-bit integers in `a` right by the amount specified by the
2986/// corresponding element in `count` while shifting in sign bits.
2987///
2988/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsravd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
    unsafe {
        let count = count.as_u32x8();
        // Lanes with a count >= 32 must come out as all sign bits, i.e. the
        // same as an arithmetic shift by 31. Substituting 31 for those lanes
        // also keeps `simd_shr` defined (full-width shifts are UB).
        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
        let count = simd_select(no_overflow, transmute(count), i32x8::splat(31));
        simd_shr(a.as_i32x8(), count).as_m256i()
    }
}
3002
3003/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
3004///
3005/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // `_mm256_srli_si256` and `_mm256_bsrli_epi128` are the same operation
    // (both map to `vpsrldq`); delegate to the shared implementation.
    _mm256_bsrli_epi128::<IMM8>(a)
}
3016
3017/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
3018///
3019/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    // Computes the shuffle index for output byte `i`. Shuffle input 0 is an
    // all-zero vector (indices 0..=31) and input 1 is `a` (indices 32..=63),
    // so index 0 shifts in a zero byte, and `32 + (i + shift)` picks the
    // source byte `shift` positions to the left within the same 128-bit lane.
    const fn mask(shift: i32, i: u32) -> u32 {
        let shift = shift as u32 & 0xff;
        // Zero when the whole lane is shifted out (shift > 15) or when the
        // source byte would cross the 128-bit lane boundary.
        if shift > 15 || (15 - (i % 16)) < shift {
            0
        } else {
            32 + (i + shift)
        }
    }
    unsafe {
        let a = a.as_i8x32();
        let r: i8x32 = simd_shuffle!(
            i8x32::ZERO,
            a,
            [
                mask(IMM8, 0),
                mask(IMM8, 1),
                mask(IMM8, 2),
                mask(IMM8, 3),
                mask(IMM8, 4),
                mask(IMM8, 5),
                mask(IMM8, 6),
                mask(IMM8, 7),
                mask(IMM8, 8),
                mask(IMM8, 9),
                mask(IMM8, 10),
                mask(IMM8, 11),
                mask(IMM8, 12),
                mask(IMM8, 13),
                mask(IMM8, 14),
                mask(IMM8, 15),
                mask(IMM8, 16),
                mask(IMM8, 17),
                mask(IMM8, 18),
                mask(IMM8, 19),
                mask(IMM8, 20),
                mask(IMM8, 21),
                mask(IMM8, 22),
                mask(IMM8, 23),
                mask(IMM8, 24),
                mask(IMM8, 25),
                mask(IMM8, 26),
                mask(IMM8, 27),
                mask(IMM8, 28),
                mask(IMM8, 29),
                mask(IMM8, 30),
                mask(IMM8, 31),
            ],
        );
        transmute(r)
    }
}
3079
3080/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
3081/// zeros.
3082///
3083/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16)
3084#[inline]
3085#[target_feature(enable = "avx2")]
3086#[cfg_attr(test, assert_instr(vpsrlw))]
3087#[stable(feature = "simd_x86", since = "1.27.0")]
3088pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
3089    unsafe { transmute(psrlw(a.as_i16x16(), count.as_i16x8())) }
3090}
3091
3092/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
3093/// zeros.
3094///
3095/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32)
3096#[inline]
3097#[target_feature(enable = "avx2")]
3098#[cfg_attr(test, assert_instr(vpsrld))]
3099#[stable(feature = "simd_x86", since = "1.27.0")]
3100pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
3101    unsafe { transmute(psrld(a.as_i32x8(), count.as_i32x4())) }
3102}
3103
3104/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
3105/// zeros.
3106///
3107/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64)
3108#[inline]
3109#[target_feature(enable = "avx2")]
3110#[cfg_attr(test, assert_instr(vpsrlq))]
3111#[stable(feature = "simd_x86", since = "1.27.0")]
3112pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
3113    unsafe { transmute(psrlq(a.as_i64x4(), count.as_i64x2())) }
3114}
3115
3116/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
3117/// zeros
3118///
3119/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16)
3120#[inline]
3121#[target_feature(enable = "avx2")]
3122#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 7))]
3123#[rustc_legacy_const_generics(1)]
3124#[stable(feature = "simd_x86", since = "1.27.0")]
3125#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3126pub const fn _mm256_srli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
3127    static_assert_uimm_bits!(IMM8, 8);
3128    unsafe {
3129        if IMM8 >= 16 {
3130            _mm256_setzero_si256()
3131        } else {
3132            transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
3133        }
3134    }
3135}
3136
3137/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
3138/// zeros
3139///
3140/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32)
3141#[inline]
3142#[target_feature(enable = "avx2")]
3143#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 7))]
3144#[rustc_legacy_const_generics(1)]
3145#[stable(feature = "simd_x86", since = "1.27.0")]
3146#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3147pub const fn _mm256_srli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
3148    static_assert_uimm_bits!(IMM8, 8);
3149    unsafe {
3150        if IMM8 >= 32 {
3151            _mm256_setzero_si256()
3152        } else {
3153            transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
3154        }
3155    }
3156}
3157
3158/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
3159/// zeros
3160///
3161/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64)
3162#[inline]
3163#[target_feature(enable = "avx2")]
3164#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 7))]
3165#[rustc_legacy_const_generics(1)]
3166#[stable(feature = "simd_x86", since = "1.27.0")]
3167#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3168pub const fn _mm256_srli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
3169    static_assert_uimm_bits!(IMM8, 8);
3170    unsafe {
3171        if IMM8 >= 64 {
3172            _mm256_setzero_si256()
3173        } else {
3174            transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
3175        }
3176    }
3177}
3178
3179/// Shifts packed 32-bit integers in `a` right by the amount specified by
/// the corresponding element in `count` while shifting in zeros.
3181///
3182/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrlvd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
    unsafe {
        let count = count.as_u32x4();
        // Lanes with a shift count >= 32 must produce 0, but `simd_shr` by
        // the full lane width is UB, so those lanes are handled explicitly.
        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
        // Clamp out-of-range counts to 0 so the shift itself is always defined...
        let count = simd_select(no_overflow, count, u32x4::ZERO);
        // ...then force the overflowing lanes to 0 in the result.
        simd_select(no_overflow, simd_shr(a.as_u32x4(), count), u32x4::ZERO).as_m128i()
    }
}
3196
3197/// Shifts packed 32-bit integers in `a` right by the amount specified by
/// the corresponding element in `count` while shifting in zeros.
3199///
3200/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrlvd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
    unsafe {
        let count = count.as_u32x8();
        // Lanes with a shift count >= 32 must produce 0, but `simd_shr` by
        // the full lane width is UB, so those lanes are handled explicitly.
        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
        // Clamp out-of-range counts to 0 so the shift itself is always defined...
        let count = simd_select(no_overflow, count, u32x8::ZERO);
        // ...then force the overflowing lanes to 0 in the result.
        simd_select(no_overflow, simd_shr(a.as_u32x8(), count), u32x8::ZERO).as_m256i()
    }
}
3214
3215/// Shifts packed 64-bit integers in `a` right by the amount specified by
/// the corresponding element in `count` while shifting in zeros.
3217///
3218/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrlvq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
    unsafe {
        let count = count.as_u64x2();
        // Lanes with a shift count >= 64 must produce 0, but `simd_shr` by
        // the full lane width is UB, so those lanes are handled explicitly.
        let no_overflow: u64x2 = simd_lt(count, u64x2::splat(u64::BITS as u64));
        // Clamp out-of-range counts to 0 so the shift itself is always defined...
        let count = simd_select(no_overflow, count, u64x2::ZERO);
        // ...then force the overflowing lanes to 0 in the result.
        simd_select(no_overflow, simd_shr(a.as_u64x2(), count), u64x2::ZERO).as_m128i()
    }
}
3232
3233/// Shifts packed 64-bit integers in `a` right by the amount specified by
/// the corresponding element in `count` while shifting in zeros.
3235///
3236/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrlvq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
    unsafe {
        let count = count.as_u64x4();
        // Lanes with a shift count >= 64 must produce 0, but `simd_shr` by
        // the full lane width is UB, so those lanes are handled explicitly.
        let no_overflow: u64x4 = simd_lt(count, u64x4::splat(u64::BITS as u64));
        // Clamp out-of-range counts to 0 so the shift itself is always defined...
        let count = simd_select(no_overflow, count, u64x4::ZERO);
        // ...then force the overflowing lanes to 0 in the result.
        simd_select(no_overflow, simd_shr(a.as_u64x4(), count), u64x4::ZERO).as_m256i()
    }
}
3250
3251/// Load 256-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr
3252/// must be aligned on a 32-byte boundary or a general-protection exception may be generated. To
3253/// minimize caching, the data is flagged as non-temporal (unlikely to be used again soon)
3254///
3255/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_load_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vmovntdqa))]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i {
    let dst: __m256i;
    // Implemented with inline asm so the exact `vmovntdqa` (non-temporal
    // load) instruction is emitted. `pure, readonly` tells the compiler the
    // asm only reads memory and has no other side effects.
    // NOTE(review): `vpl!` presumably splices the `{p}` memory operand into
    // the instruction string — confirm against the macro's definition.
    crate::arch::asm!(
        vpl!("vmovntdqa {a}"),
        a = out(ymm_reg) dst,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack, preserves_flags),
    );
    dst
}
3270
3271/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
3272///
3273/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16)
3274#[inline]
3275#[target_feature(enable = "avx2")]
3276#[cfg_attr(test, assert_instr(vpsubw))]
3277#[stable(feature = "simd_x86", since = "1.27.0")]
3278#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3279pub const fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
3280    unsafe { transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) }
3281}
3282
3283/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`
3284///
3285/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32)
3286#[inline]
3287#[target_feature(enable = "avx2")]
3288#[cfg_attr(test, assert_instr(vpsubd))]
3289#[stable(feature = "simd_x86", since = "1.27.0")]
3290#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3291pub const fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
3292    unsafe { transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) }
3293}
3294
3295/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`
3296///
3297/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64)
3298#[inline]
3299#[target_feature(enable = "avx2")]
3300#[cfg_attr(test, assert_instr(vpsubq))]
3301#[stable(feature = "simd_x86", since = "1.27.0")]
3302#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3303pub const fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
3304    unsafe { transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) }
3305}
3306
3307/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`
3308///
3309/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8)
3310#[inline]
3311#[target_feature(enable = "avx2")]
3312#[cfg_attr(test, assert_instr(vpsubb))]
3313#[stable(feature = "simd_x86", since = "1.27.0")]
3314#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3315pub const fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
3316    unsafe { transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) }
3317}
3318
3319/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in
3320/// `a` using saturation.
3321///
3322/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16)
3323#[inline]
3324#[target_feature(enable = "avx2")]
3325#[cfg_attr(test, assert_instr(vpsubsw))]
3326#[stable(feature = "simd_x86", since = "1.27.0")]
3327#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3328pub const fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
3329    unsafe { transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16())) }
3330}
3331
3332/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in
3333/// `a` using saturation.
3334///
3335/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8)
3336#[inline]
3337#[target_feature(enable = "avx2")]
3338#[cfg_attr(test, assert_instr(vpsubsb))]
3339#[stable(feature = "simd_x86", since = "1.27.0")]
3340#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3341pub const fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
3342    unsafe { transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32())) }
3343}
3344
3345/// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit
3346/// integers in `a` using saturation.
3347///
3348/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16)
3349#[inline]
3350#[target_feature(enable = "avx2")]
3351#[cfg_attr(test, assert_instr(vpsubusw))]
3352#[stable(feature = "simd_x86", since = "1.27.0")]
3353#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3354pub const fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
3355    unsafe { transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16())) }
3356}
3357
3358/// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit
3359/// integers in `a` using saturation.
3360///
3361/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8)
3362#[inline]
3363#[target_feature(enable = "avx2")]
3364#[cfg_attr(test, assert_instr(vpsubusb))]
3365#[stable(feature = "simd_x86", since = "1.27.0")]
3366#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3367pub const fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
3368    unsafe { transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32())) }
3369}
3370
/// Unpacks and interleaves 8-bit integers from the high half of each
3372/// 128-bit lane in `a` and `b`.
3373///
3374/// ```rust
3375/// #[cfg(target_arch = "x86")]
3376/// use std::arch::x86::*;
3377/// #[cfg(target_arch = "x86_64")]
3378/// use std::arch::x86_64::*;
3379///
3380/// # fn main() {
3381/// #     if is_x86_feature_detected!("avx2") {
3382/// #         #[target_feature(enable = "avx2")]
3383/// #         unsafe fn worker() {
3384/// let a = _mm256_setr_epi8(
3385///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3386///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3387/// );
3388/// let b = _mm256_setr_epi8(
3389///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3390///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3391///     -30, -31,
3392/// );
3393///
3394/// let c = _mm256_unpackhi_epi8(a, b);
3395///
3396/// let expected = _mm256_setr_epi8(
3397///     8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
3398///     24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
3399///     -31,
3400/// );
3401/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3402///
3403/// #         }
3404/// #         unsafe { worker(); }
3405/// #     }
3406/// # }
3407/// ```
3408///
3409/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpunpckhbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Indices 0..=31 pick from `a`, 32..=63 from `b`; within each 128-bit
        // lane the high 8 bytes of `a` and `b` are interleaved.
        #[rustfmt::skip]
        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
                8, 40, 9, 41, 10, 42, 11, 43,
                12, 44, 13, 45, 14, 46, 15, 47,
                24, 56, 25, 57, 26, 58, 27, 59,
                28, 60, 29, 61, 30, 62, 31, 63,
        ]);
        transmute(r)
    }
}
3427
/// Unpacks and interleaves 8-bit integers from the low half of each
3429/// 128-bit lane of `a` and `b`.
3430///
3431/// ```rust
3432/// #[cfg(target_arch = "x86")]
3433/// use std::arch::x86::*;
3434/// #[cfg(target_arch = "x86_64")]
3435/// use std::arch::x86_64::*;
3436///
3437/// # fn main() {
3438/// #     if is_x86_feature_detected!("avx2") {
3439/// #         #[target_feature(enable = "avx2")]
3440/// #         unsafe fn worker() {
3441/// let a = _mm256_setr_epi8(
3442///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3443///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3444/// );
3445/// let b = _mm256_setr_epi8(
3446///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3447///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3448///     -30, -31,
3449/// );
3450///
3451/// let c = _mm256_unpacklo_epi8(a, b);
3452///
3453/// let expected = _mm256_setr_epi8(
3454///     0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
3455///     -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
3456/// );
3457/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3458///
3459/// #         }
3460/// #         unsafe { worker(); }
3461/// #     }
3462/// # }
3463/// ```
3464///
3465/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpunpcklbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Indices 0..=31 pick from `a`, 32..=63 from `b`; within each 128-bit
        // lane the low 8 bytes of `a` and `b` are interleaved.
        #[rustfmt::skip]
        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
            0, 32, 1, 33, 2, 34, 3, 35,
            4, 36, 5, 37, 6, 38, 7, 39,
            16, 48, 17, 49, 18, 50, 19, 51,
            20, 52, 21, 53, 22, 54, 23, 55,
        ]);
        transmute(r)
    }
}
3483
/// Unpacks and interleaves 16-bit integers from the high half of each
3485/// 128-bit lane of `a` and `b`.
3486///
3487/// ```rust
3488/// #[cfg(target_arch = "x86")]
3489/// use std::arch::x86::*;
3490/// #[cfg(target_arch = "x86_64")]
3491/// use std::arch::x86_64::*;
3492///
3493/// # fn main() {
3494/// #     if is_x86_feature_detected!("avx2") {
3495/// #         #[target_feature(enable = "avx2")]
3496/// #         unsafe fn worker() {
3497/// let a = _mm256_setr_epi16(
3498///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3499/// );
3500/// let b = _mm256_setr_epi16(
3501///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3502/// );
3503///
3504/// let c = _mm256_unpackhi_epi16(a, b);
3505///
3506/// let expected = _mm256_setr_epi16(
3507///     4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
3508/// );
3509/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3510///
3511/// #         }
3512/// #         unsafe { worker(); }
3513/// #     }
3514/// # }
3515/// ```
3516///
3517/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpunpckhwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Indices 0..=15 pick from `a`, 16..=31 from `b`; within each 128-bit
        // lane the high 4 words of `a` and `b` are interleaved.
        let r: i16x16 = simd_shuffle!(
            a.as_i16x16(),
            b.as_i16x16(),
            [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
        );
        transmute(r)
    }
}
3533
/// Unpacks and interleaves 16-bit integers from the low half of each
3535/// 128-bit lane of `a` and `b`.
3536///
3537/// ```rust
3538/// #[cfg(target_arch = "x86")]
3539/// use std::arch::x86::*;
3540/// #[cfg(target_arch = "x86_64")]
3541/// use std::arch::x86_64::*;
3542///
3543/// # fn main() {
3544/// #     if is_x86_feature_detected!("avx2") {
3545/// #         #[target_feature(enable = "avx2")]
3546/// #         unsafe fn worker() {
3547///
3548/// let a = _mm256_setr_epi16(
3549///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3550/// );
3551/// let b = _mm256_setr_epi16(
3552///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3553/// );
3554///
3555/// let c = _mm256_unpacklo_epi16(a, b);
3556///
3557/// let expected = _mm256_setr_epi16(
3558///     0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
3559/// );
3560/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3561///
3562/// #         }
3563/// #         unsafe { worker(); }
3564/// #     }
3565/// # }
3566/// ```
3567///
3568/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpunpcklwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Indices 0..=15 pick from `a`, 16..=31 from `b`; within each 128-bit
        // lane the low 4 words of `a` and `b` are interleaved.
        let r: i16x16 = simd_shuffle!(
            a.as_i16x16(),
            b.as_i16x16(),
            [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
        );
        transmute(r)
    }
}
3584
/// Unpacks and interleaves 32-bit integers from the high half of each
3586/// 128-bit lane of `a` and `b`.
3587///
3588/// ```rust
3589/// #[cfg(target_arch = "x86")]
3590/// use std::arch::x86::*;
3591/// #[cfg(target_arch = "x86_64")]
3592/// use std::arch::x86_64::*;
3593///
3594/// # fn main() {
3595/// #     if is_x86_feature_detected!("avx2") {
3596/// #         #[target_feature(enable = "avx2")]
3597/// #         unsafe fn worker() {
3598/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3599/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3600///
3601/// let c = _mm256_unpackhi_epi32(a, b);
3602///
3603/// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
3604/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3605///
3606/// #         }
3607/// #         unsafe { worker(); }
3608/// #     }
3609/// # }
3610/// ```
3611///
3612/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vunpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Indices 0..=7 pick from `a`, 8..=15 from `b`; within each 128-bit
        // lane the high 2 dwords of `a` and `b` are interleaved.
        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
        transmute(r)
    }
}
3624
/// Unpacks and interleaves 32-bit integers from the low half of each
3626/// 128-bit lane of `a` and `b`.
3627///
3628/// ```rust
3629/// #[cfg(target_arch = "x86")]
3630/// use std::arch::x86::*;
3631/// #[cfg(target_arch = "x86_64")]
3632/// use std::arch::x86_64::*;
3633///
3634/// # fn main() {
3635/// #     if is_x86_feature_detected!("avx2") {
3636/// #         #[target_feature(enable = "avx2")]
3637/// #         unsafe fn worker() {
3638/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3639/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3640///
3641/// let c = _mm256_unpacklo_epi32(a, b);
3642///
3643/// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
3644/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3645///
3646/// #         }
3647/// #         unsafe { worker(); }
3648/// #     }
3649/// # }
3650/// ```
3651///
3652/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vunpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        // Indices 0..=7 pick from `a`, 8..=15 from `b`; within each 128-bit
        // lane the low 2 dwords of `a` and `b` are interleaved.
        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
        transmute(r)
    }
}
3664
/// Unpacks and interleaves 64-bit integers from the high half of each
3666/// 128-bit lane of `a` and `b`.
3667///
3668/// ```rust
3669/// #[cfg(target_arch = "x86")]
3670/// use std::arch::x86::*;
3671/// #[cfg(target_arch = "x86_64")]
3672/// use std::arch::x86_64::*;
3673///
3674/// # fn main() {
3675/// #     if is_x86_feature_detected!("avx2") {
3676/// #         #[target_feature(enable = "avx2")]
3677/// #         unsafe fn worker() {
3678/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3679/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3680///
3681/// let c = _mm256_unpackhi_epi64(a, b);
3682///
3683/// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
3684/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3685///
3686/// #         }
3687/// #         unsafe { worker(); }
3688/// #     }
3689/// # }
3690/// ```
3691///
3692/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64)
3693#[inline]
3694#[target_feature(enable = "avx2")]
3695#[cfg_attr(test, assert_instr(vunpckhpd))]
3696#[stable(feature = "simd_x86", since = "1.27.0")]
3697#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3698pub const fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
3699    unsafe {
3700        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
3701        transmute(r)
3702    }
3703}
3704
3705/// Unpacks and interleave 64-bit integers from the low half of each
3706/// 128-bit lane of `a` and `b`.
3707///
3708/// ```rust
3709/// #[cfg(target_arch = "x86")]
3710/// use std::arch::x86::*;
3711/// #[cfg(target_arch = "x86_64")]
3712/// use std::arch::x86_64::*;
3713///
3714/// # fn main() {
3715/// #     if is_x86_feature_detected!("avx2") {
3716/// #         #[target_feature(enable = "avx2")]
3717/// #         unsafe fn worker() {
3718/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3719/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3720///
3721/// let c = _mm256_unpacklo_epi64(a, b);
3722///
3723/// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
3724/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3725///
3726/// #         }
3727/// #         unsafe { worker(); }
3728/// #     }
3729/// # }
3730/// ```
3731///
3732/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64)
3733#[inline]
3734#[target_feature(enable = "avx2")]
3735#[cfg_attr(test, assert_instr(vunpcklpd))]
3736#[stable(feature = "simd_x86", since = "1.27.0")]
3737#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3738pub const fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
3739    unsafe {
3740        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
3741        transmute(r)
3742    }
3743}
3744
3745/// Computes the bitwise XOR of 256 bits (representing integer data)
3746/// in `a` and `b`
3747///
3748/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256)
3749#[inline]
3750#[target_feature(enable = "avx2")]
3751#[cfg_attr(test, assert_instr(vxorps))]
3752#[stable(feature = "simd_x86", since = "1.27.0")]
3753#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
3754pub const fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
3755    unsafe { transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) }
3756}
3757
/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
/// integer containing the zero-extended integer data.
///
/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8)
#[inline]
#[target_feature(enable = "avx2")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extract_epi8<const INDEX: i32>(a: __m256i) -> i32 {
    // The vector holds 32 bytes, so INDEX must fit in 5 unsigned bits (0..=31).
    static_assert_uimm_bits!(INDEX, 5);
    // Extract as `u8` so the widening cast to `i32` zero-extends, per the
    // documented contract (not sign-extends).
    unsafe { simd_extract!(a.as_u8x32(), INDEX as u32, u8) as i32 }
}
3774
/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
/// integer containing the zero-extended integer data.
///
/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16)
#[inline]
#[target_feature(enable = "avx2")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
    // The vector holds 16 words, so INDEX must fit in 4 unsigned bits (0..=15).
    static_assert_uimm_bits!(INDEX, 4);
    // Extract as `u16` so the widening cast to `i32` zero-extends, per the
    // documented contract (not sign-extends).
    unsafe { simd_extract!(a.as_u16x16(), INDEX as u32, u16) as i32 }
}
3791
// FFI bindings to the LLVM intrinsics backing the AVX2 operations that cannot
// be expressed with the generic SIMD primitives (saturating horizontal math,
// saturating packs, vector-count shifts, byte shuffles/permutes, and gathers).
// The exact `link_name` strings are LLVM's intrinsic names and must not change.
#[allow(improper_ctypes)]
unsafe extern "C" {
    // Horizontal saturating add/subtract of adjacent 16-bit pairs.
    #[link_name = "llvm.x86.avx2.phadd.sw"]
    fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
    #[link_name = "llvm.x86.avx2.phsub.sw"]
    fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
    // Multiply-then-horizontally-add variants.
    #[link_name = "llvm.x86.avx2.pmadd.wd"]
    fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
    #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
    fn pmaddubsw(a: u8x32, b: i8x32) -> i16x16;
    // Multiple sum-of-absolute-differences (imm8 selects source offsets).
    #[link_name = "llvm.x86.avx2.mpsadbw"]
    fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16;
    // High 16 bits of 16x16->32 multiply with round-and-scale.
    #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
    fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
    // Narrowing packs with signed ("ss") or unsigned ("us") saturation.
    #[link_name = "llvm.x86.avx2.packsswb"]
    fn packsswb(a: i16x16, b: i16x16) -> i8x32;
    #[link_name = "llvm.x86.avx2.packssdw"]
    fn packssdw(a: i32x8, b: i32x8) -> i16x16;
    #[link_name = "llvm.x86.avx2.packuswb"]
    fn packuswb(a: i16x16, b: i16x16) -> u8x32;
    #[link_name = "llvm.x86.avx2.packusdw"]
    fn packusdw(a: i32x8, b: i32x8) -> u16x16;
    // Sum of absolute differences over 8-byte groups.
    #[link_name = "llvm.x86.avx2.psad.bw"]
    fn psadbw(a: u8x32, b: u8x32) -> u64x4;
    // Per-element sign application: negate/zero/keep `a` based on sign of `b`.
    #[link_name = "llvm.x86.avx2.psign.b"]
    fn psignb(a: i8x32, b: i8x32) -> i8x32;
    #[link_name = "llvm.x86.avx2.psign.w"]
    fn psignw(a: i16x16, b: i16x16) -> i16x16;
    #[link_name = "llvm.x86.avx2.psign.d"]
    fn psignd(a: i32x8, b: i32x8) -> i32x8;
    // Shifts of every element by the scalar count held in a 128-bit vector.
    #[link_name = "llvm.x86.avx2.psll.w"]
    fn psllw(a: i16x16, count: i16x8) -> i16x16;
    #[link_name = "llvm.x86.avx2.psll.d"]
    fn pslld(a: i32x8, count: i32x4) -> i32x8;
    #[link_name = "llvm.x86.avx2.psll.q"]
    fn psllq(a: i64x4, count: i64x2) -> i64x4;
    #[link_name = "llvm.x86.avx2.psra.w"]
    fn psraw(a: i16x16, count: i16x8) -> i16x16;
    #[link_name = "llvm.x86.avx2.psra.d"]
    fn psrad(a: i32x8, count: i32x4) -> i32x8;
    #[link_name = "llvm.x86.avx2.psrl.w"]
    fn psrlw(a: i16x16, count: i16x8) -> i16x16;
    #[link_name = "llvm.x86.avx2.psrl.d"]
    fn psrld(a: i32x8, count: i32x4) -> i32x8;
    #[link_name = "llvm.x86.avx2.psrl.q"]
    fn psrlq(a: i64x4, count: i64x2) -> i64x4;
    // Byte shuffle within lanes and cross-lane dword/float permutes.
    #[link_name = "llvm.x86.avx2.pshuf.b"]
    fn pshufb(a: u8x32, b: u8x32) -> u8x32;
    #[link_name = "llvm.x86.avx2.permd"]
    fn permd(a: u32x8, b: u32x8) -> u32x8;
    #[link_name = "llvm.x86.avx2.permps"]
    fn permps(a: __m256, b: i32x8) -> __m256;
    // Masked gathers: `d`/`q` pick 32/64-bit offsets, suffix picks element
    // type; naming follows <offset-kind>.<element-kind>[.256].
    #[link_name = "llvm.x86.avx2.gather.d.d"]
    fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
    #[link_name = "llvm.x86.avx2.gather.d.d.256"]
    fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
    #[link_name = "llvm.x86.avx2.gather.d.q"]
    fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
    #[link_name = "llvm.x86.avx2.gather.d.q.256"]
    fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
    #[link_name = "llvm.x86.avx2.gather.q.d"]
    fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
    #[link_name = "llvm.x86.avx2.gather.q.d.256"]
    fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
    #[link_name = "llvm.x86.avx2.gather.q.q"]
    fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
    #[link_name = "llvm.x86.avx2.gather.q.q.256"]
    fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
    #[link_name = "llvm.x86.avx2.gather.d.pd"]
    fn pgatherdpd(
        src: __m128d,
        slice: *const i8,
        offsets: i32x4,
        mask: __m128d,
        scale: i8,
    ) -> __m128d;
    #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
    fn vpgatherdpd(
        src: __m256d,
        slice: *const i8,
        offsets: i32x4,
        mask: __m256d,
        scale: i8,
    ) -> __m256d;
    #[link_name = "llvm.x86.avx2.gather.q.pd"]
    fn pgatherqpd(
        src: __m128d,
        slice: *const i8,
        offsets: i64x2,
        mask: __m128d,
        scale: i8,
    ) -> __m128d;
    #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
    fn vpgatherqpd(
        src: __m256d,
        slice: *const i8,
        offsets: i64x4,
        mask: __m256d,
        scale: i8,
    ) -> __m256d;
    #[link_name = "llvm.x86.avx2.gather.d.ps"]
    fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
    -> __m128;
    #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
    fn vpgatherdps(
        src: __m256,
        slice: *const i8,
        offsets: i32x8,
        mask: __m256,
        scale: i8,
    ) -> __m256;
    #[link_name = "llvm.x86.avx2.gather.q.ps"]
    fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
    -> __m128;
    #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
    fn vpgatherqps(
        src: __m128,
        slice: *const i8,
        offsets: i64x4,
        mask: __m128,
        scale: i8,
    ) -> __m128;
}
3915
3916#[cfg(test)]
3917mod tests {
3918    use crate::core_arch::assert_eq_const as assert_eq;
3919
3920    use stdarch_test::simd_test;
3921
3922    use crate::core_arch::x86::*;
3923
    // Absolute value per lane. Note `iN::MIN` has no positive counterpart and
    // wraps back to itself, spelled below as `iN::MAX.wrapping_add(1)`.
    #[simd_test(enable = "avx2")]
    const fn test_mm256_abs_epi32() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi32(
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );
        let r = _mm256_abs_epi32(a);
        #[rustfmt::skip]
        let e = _mm256_setr_epi32(
            0, 1, 1, i32::MAX,
            i32::MAX.wrapping_add(1), 100, 100, 32,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_abs_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0,  1, -1, 2, -2, 3, -3, 4,
            -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32,
        );
        let r = _mm256_abs_epi16(a);
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            0, 1, 1, 2, 2, 3, 3, 4,
            4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_abs_epi8() {
        // Same pattern repeated in both 128-bit halves of the vector.
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 1, -1, 2, -2, 3, -3, 4,
            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
            0, 1, -1, 2, -2, 3, -3, 4,
            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
        );
        let r = _mm256_abs_epi8(a);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            0, 1, 1, 2, 2, 3, 3, 4,
            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
            0, 1, 1, 2, 2, 3, 3, 4,
            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
        );
        assert_eq_m256i(r, e);
    }
3975
    // Lane-wise (wrapping) addition at each element width.
    #[simd_test(enable = "avx2")]
    const fn test_mm256_add_epi64() {
        let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
        let b = _mm256_setr_epi64x(-1, 0, 1, 2);
        let r = _mm256_add_epi64(a, b);
        let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_add_epi32() {
        let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_add_epi32(a, b);
        let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_add_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r = _mm256_add_epi16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            0, 2, 4, 6, 8, 10, 12, 14,
            16, 18, 20, 22, 24, 26, 28, 30,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_add_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm256_add_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            0, 2, 4, 6, 8, 10, 12, 14,
            16, 18, 20, 22, 24, 26, 28, 30,
            32, 34, 36, 38, 40, 42, 44, 46,
            48, 50, 52, 54, 56, 58, 60, 62,
        );
        assert_eq_m256i(r, e);
    }
4041
    // Signed saturating addition: the in-range case plus both clamp directions
    // (i8/i16 MAX + 1 stays MAX, MIN + (-1) stays MIN).
    #[simd_test(enable = "avx2")]
    const fn test_mm256_adds_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            32, 33, 34, 35, 36, 37, 38, 39,
            40, 41, 42, 43, 44, 45, 46, 47,
            48, 49, 50, 51, 52, 53, 54, 55,
            56, 57, 58, 59, 60, 61, 62, 63,
        );
        let r = _mm256_adds_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            32, 34, 36, 38, 40, 42, 44, 46,
            48, 50, 52, 54, 56, 58, 60, 62,
            64, 66, 68, 70, 72, 74, 76, 78,
            80, 82, 84, 86, 88, 90, 92, 94,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_adds_epi8_saturate_positive() {
        // i8::MAX + 1 saturates to i8::MAX in every lane.
        let a = _mm256_set1_epi8(0x7F);
        let b = _mm256_set1_epi8(1);
        let r = _mm256_adds_epi8(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_adds_epi8_saturate_negative() {
        // i8::MIN + (-1) saturates to i8::MIN in every lane.
        let a = _mm256_set1_epi8(-0x80);
        let b = _mm256_set1_epi8(-1);
        let r = _mm256_adds_epi8(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_adds_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi16(
            32, 33, 34, 35, 36, 37, 38, 39,
            40, 41, 42, 43, 44, 45, 46, 47,
        );
        let r = _mm256_adds_epi16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            32, 34, 36, 38, 40, 42, 44, 46,
            48, 50, 52, 54, 56, 58, 60, 62,
        );

        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_adds_epi16_saturate_positive() {
        let a = _mm256_set1_epi16(0x7FFF);
        let b = _mm256_set1_epi16(1);
        let r = _mm256_adds_epi16(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_adds_epi16_saturate_negative() {
        let a = _mm256_set1_epi16(-0x8000);
        let b = _mm256_set1_epi16(-1);
        let r = _mm256_adds_epi16(a, b);
        assert_eq_m256i(r, a);
    }
4122
    // Unsigned saturating addition: the in-range case plus the clamp at the
    // top of the range (uN::MAX + 1 stays uN::MAX).
    #[simd_test(enable = "avx2")]
    const fn test_mm256_adds_epu8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            32, 33, 34, 35, 36, 37, 38, 39,
            40, 41, 42, 43, 44, 45, 46, 47,
            48, 49, 50, 51, 52, 53, 54, 55,
            56, 57, 58, 59, 60, 61, 62, 63,
        );
        let r = _mm256_adds_epu8(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            32, 34, 36, 38, 40, 42, 44, 46,
            48, 50, 52, 54, 56, 58, 60, 62,
            64, 66, 68, 70, 72, 74, 76, 78,
            80, 82, 84, 86, 88, 90, 92, 94,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_adds_epu8_saturate() {
        // !0 is u8::MAX when the lane is viewed unsigned; adding 1 saturates.
        let a = _mm256_set1_epi8(!0);
        let b = _mm256_set1_epi8(1);
        let r = _mm256_adds_epu8(a, b);
        assert_eq_m256i(r, a);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_adds_epu16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi16(
            32, 33, 34, 35, 36, 37, 38, 39,
            40, 41, 42, 43, 44, 45, 46, 47,
        );
        let r = _mm256_adds_epu16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            32, 34, 36, 38, 40, 42, 44, 46,
            48, 50, 52, 54, 56, 58, 60, 62,
        );

        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    fn test_mm256_adds_epu16_saturate() {
        let a = _mm256_set1_epi16(!0);
        let b = _mm256_set1_epi16(1);
        let r = _mm256_adds_epu16(a, b);
        assert_eq_m256i(r, a);
    }
4187
    // Bitwise AND: 5 & 3 == 1.
    #[simd_test(enable = "avx2")]
    const fn test_mm256_and_si256() {
        let a = _mm256_set1_epi8(5);
        let b = _mm256_set1_epi8(3);
        let got = _mm256_and_si256(a, b);
        assert_eq_m256i(got, _mm256_set1_epi8(1));
    }

    // AND-NOT: (!5) & 3 == 2 (the first operand is the one inverted).
    #[simd_test(enable = "avx2")]
    const fn test_mm256_andnot_si256() {
        let a = _mm256_set1_epi8(5);
        let b = _mm256_set1_epi8(3);
        let got = _mm256_andnot_si256(a, b);
        assert_eq_m256i(got, _mm256_set1_epi8(2));
    }

    // Unsigned rounding average: (a + b + 1) >> 1, so (3 + 9 + 1) >> 1 == 6.
    #[simd_test(enable = "avx2")]
    const fn test_mm256_avg_epu8() {
        let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9));
        let r = _mm256_avg_epu8(a, b);
        assert_eq_m256i(r, _mm256_set1_epi8(6));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_avg_epu16() {
        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
        let r = _mm256_avg_epu16(a, b);
        assert_eq_m256i(r, _mm256_set1_epi16(6));
    }
4217
    // Immediate blends: bit i of the mask selects element i from the second
    // operand, a clear bit keeps the first operand's element.
    #[simd_test(enable = "avx2")]
    const fn test_mm_blend_epi32() {
        let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9));
        let e = _mm_setr_epi32(9, 3, 3, 3);
        let r = _mm_blend_epi32::<0x01>(a, b);
        assert_eq_m128i(r, e);

        // Complementary mask with the operands swapped yields the same result.
        let r = _mm_blend_epi32::<0x0E>(b, a);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_blend_epi32() {
        let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9));
        let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
        let r = _mm256_blend_epi32::<0x01>(a, b);
        assert_eq_m256i(r, e);

        let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
        let r = _mm256_blend_epi32::<0x82>(a, b);
        assert_eq_m256i(r, e);

        let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
        let r = _mm256_blend_epi32::<0x7C>(a, b);
        assert_eq_m256i(r, e);
    }

    // 16-bit blend: the 8-bit mask applies to each 128-bit lane separately.
    #[simd_test(enable = "avx2")]
    const fn test_mm256_blend_epi16() {
        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
        let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3);
        let r = _mm256_blend_epi16::<0x01>(a, b);
        assert_eq_m256i(r, e);

        let r = _mm256_blend_epi16::<0xFE>(b, a);
        assert_eq_m256i(r, e);
    }

    // Variable blend: the sign bit of each mask byte selects from `b`.
    #[simd_test(enable = "avx2")]
    const fn test_mm256_blendv_epi8() {
        let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2));
        let mask = _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), -1);
        let e = _mm256_insert_epi8::<2>(_mm256_set1_epi8(4), 2);
        let r = _mm256_blendv_epi8(a, b, mask);
        assert_eq_m256i(r, e);
    }
4264
    // Broadcasts: element 0 of the source is replicated to every destination
    // lane; the other source elements are deliberately non-zero to prove they
    // are ignored.
    #[simd_test(enable = "avx2")]
    const fn test_mm_broadcastb_epi8() {
        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
        let res = _mm_broadcastb_epi8(a);
        assert_eq_m128i(res, _mm_set1_epi8(0x2a));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_broadcastb_epi8() {
        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
        let res = _mm256_broadcastb_epi8(a);
        assert_eq_m256i(res, _mm256_set1_epi8(0x2a));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm_broadcastd_epi32() {
        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
        let res = _mm_broadcastd_epi32(a);
        assert_eq_m128i(res, _mm_set1_epi32(0x2a));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_broadcastd_epi32() {
        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
        let res = _mm256_broadcastd_epi32(a);
        assert_eq_m256i(res, _mm256_set1_epi32(0x2a));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm_broadcastq_epi64() {
        let a = _mm_setr_epi64x(0x1ffffffff, 0);
        let res = _mm_broadcastq_epi64(a);
        assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_broadcastq_epi64() {
        let a = _mm_setr_epi64x(0x1ffffffff, 0);
        let res = _mm256_broadcastq_epi64(a);
        assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm_broadcastsd_pd() {
        let a = _mm_setr_pd(6.88, 3.44);
        let res = _mm_broadcastsd_pd(a);
        assert_eq_m128d(res, _mm_set1_pd(6.88));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_broadcastsd_pd() {
        let a = _mm_setr_pd(6.88, 3.44);
        let res = _mm256_broadcastsd_pd(a);
        assert_eq_m256d(res, _mm256_set1_pd(6.88f64));
    }

    // 128-bit broadcast: the whole source vector is duplicated into both
    // halves of the 256-bit result.
    #[simd_test(enable = "avx2")]
    const fn test_mm_broadcastsi128_si256() {
        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
        let res = _mm_broadcastsi128_si256(a);
        let retval = _mm256_setr_epi64x(
            0x0987654321012334,
            0x5678909876543210,
            0x0987654321012334,
            0x5678909876543210,
        );
        assert_eq_m256i(res, retval);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_broadcastsi128_si256() {
        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
        let res = _mm256_broadcastsi128_si256(a);
        let retval = _mm256_setr_epi64x(
            0x0987654321012334,
            0x5678909876543210,
            0x0987654321012334,
            0x5678909876543210,
        );
        assert_eq_m256i(res, retval);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm_broadcastss_ps() {
        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
        let res = _mm_broadcastss_ps(a);
        assert_eq_m128(res, _mm_set1_ps(6.88));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_broadcastss_ps() {
        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
        let res = _mm256_broadcastss_ps(a);
        assert_eq_m256(res, _mm256_set1_ps(6.88));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm_broadcastw_epi16() {
        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
        let res = _mm_broadcastw_epi16(a);
        assert_eq_m128i(res, _mm_set1_epi16(0x22b));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_broadcastw_epi16() {
        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
        let res = _mm256_broadcastw_epi16(a);
        assert_eq_m256i(res, _mm256_set1_epi16(0x22b));
    }
4374
    // Comparisons produce all-ones (!0) in matching lanes and zero elsewhere.
    // Inputs are built so exactly one lane (index 2) compares equal.
    #[simd_test(enable = "avx2")]
    const fn test_mm256_cmpeq_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            31, 30, 2, 28, 27, 26, 25, 24,
            23, 22, 21, 20, 19, 18, 17, 16,
            15, 14, 13, 12, 11, 10, 9, 8,
            7, 6, 5, 4, 3, 2, 1, 0,
        );
        let r = _mm256_cmpeq_epi8(a, b);
        assert_eq_m256i(r, _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), !0));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cmpeq_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi16(
            15, 14, 2, 12, 11, 10, 9, 8,
            7, 6, 5, 4, 3, 2, 1, 0,
        );
        let r = _mm256_cmpeq_epi16(a, b);
        assert_eq_m256i(r, _mm256_insert_epi16::<2>(_mm256_set1_epi16(0), !0));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cmpeq_epi32() {
        let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
        let r = _mm256_cmpeq_epi32(a, b);
        let e = _mm256_set1_epi32(0);
        let e = _mm256_insert_epi32::<2>(e, !0);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cmpeq_epi64() {
        let a = _mm256_setr_epi64x(0, 1, 2, 3);
        let b = _mm256_setr_epi64x(3, 2, 2, 0);
        let r = _mm256_cmpeq_epi64(a, b);
        assert_eq_m256i(r, _mm256_insert_epi64::<2>(_mm256_set1_epi64x(0), !0));
    }

    // Signed greater-than: only lane 0 (5 > 0) should be set.
    #[simd_test(enable = "avx2")]
    const fn test_mm256_cmpgt_epi8() {
        let a = _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), 5);
        let b = _mm256_set1_epi8(0);
        let r = _mm256_cmpgt_epi8(a, b);
        assert_eq_m256i(r, _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), !0));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cmpgt_epi16() {
        let a = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 5);
        let b = _mm256_set1_epi16(0);
        let r = _mm256_cmpgt_epi16(a, b);
        assert_eq_m256i(r, _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), !0));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cmpgt_epi32() {
        let a = _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), 5);
        let b = _mm256_set1_epi32(0);
        let r = _mm256_cmpgt_epi32(a, b);
        assert_eq_m256i(r, _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), !0));
    }

    #[simd_test(enable = "avx2")]
    const fn test_mm256_cmpgt_epi64() {
        let a = _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), 5);
        let b = _mm256_set1_epi64x(0);
        let r = _mm256_cmpgt_epi64(a, b);
        assert_eq_m256i(r, _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), !0));
    }
4460
4461    #[simd_test(enable = "avx2")]
4462    const fn test_mm256_cvtepi8_epi16() {
4463        #[rustfmt::skip]
4464        let a = _mm_setr_epi8(
4465            0, 0, -1, 1, -2, 2, -3, 3,
4466            -4, 4, -5, 5, -6, 6, -7, 7,
4467        );
4468        #[rustfmt::skip]
4469        let r = _mm256_setr_epi16(
4470            0, 0, -1, 1, -2, 2, -3, 3,
4471            -4, 4, -5, 5, -6, 6, -7, 7,
4472        );
4473        assert_eq_m256i(r, _mm256_cvtepi8_epi16(a));
4474    }
4475
4476    #[simd_test(enable = "avx2")]
4477    const fn test_mm256_cvtepi8_epi32() {
4478        #[rustfmt::skip]
4479        let a = _mm_setr_epi8(
4480            0, 0, -1, 1, -2, 2, -3, 3,
4481            -4, 4, -5, 5, -6, 6, -7, 7,
4482        );
4483        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4484        assert_eq_m256i(r, _mm256_cvtepi8_epi32(a));
4485    }
4486
4487    #[simd_test(enable = "avx2")]
4488    const fn test_mm256_cvtepi8_epi64() {
4489        #[rustfmt::skip]
4490        let a = _mm_setr_epi8(
4491            0, 0, -1, 1, -2, 2, -3, 3,
4492            -4, 4, -5, 5, -6, 6, -7, 7,
4493        );
4494        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4495        assert_eq_m256i(r, _mm256_cvtepi8_epi64(a));
4496    }
4497
4498    #[simd_test(enable = "avx2")]
4499    const fn test_mm256_cvtepi16_epi32() {
4500        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4501        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4502        assert_eq_m256i(r, _mm256_cvtepi16_epi32(a));
4503    }
4504
4505    #[simd_test(enable = "avx2")]
4506    const fn test_mm256_cvtepi16_epi64() {
4507        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4508        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4509        assert_eq_m256i(r, _mm256_cvtepi16_epi64(a));
4510    }
4511
4512    #[simd_test(enable = "avx2")]
4513    const fn test_mm256_cvtepi32_epi64() {
4514        let a = _mm_setr_epi32(0, 0, -1, 1);
4515        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4516        assert_eq_m256i(r, _mm256_cvtepi32_epi64(a));
4517    }
4518
4519    #[simd_test(enable = "avx2")]
4520    const fn test_mm256_cvtepu16_epi32() {
4521        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4522        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4523        assert_eq_m256i(r, _mm256_cvtepu16_epi32(a));
4524    }
4525
4526    #[simd_test(enable = "avx2")]
4527    const fn test_mm256_cvtepu16_epi64() {
4528        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4529        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4530        assert_eq_m256i(r, _mm256_cvtepu16_epi64(a));
4531    }
4532
4533    #[simd_test(enable = "avx2")]
4534    const fn test_mm256_cvtepu32_epi64() {
4535        let a = _mm_setr_epi32(0, 1, 2, 3);
4536        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4537        assert_eq_m256i(r, _mm256_cvtepu32_epi64(a));
4538    }
4539
    #[simd_test(enable = "avx2")]
    const fn test_mm256_cvtepu8_epi16() {
        // Zero-extension: all 16 bytes widen to 16-bit lanes unchanged.
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let r = _mm256_setr_epi16(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        assert_eq_m256i(r, _mm256_cvtepu8_epi16(a));
    }
4554
    #[simd_test(enable = "avx2")]
    const fn test_mm256_cvtepu8_epi32() {
        // Zero-extension: only the low 8 bytes widen to i32 lanes;
        // bytes 8..15 are discarded.
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m256i(r, _mm256_cvtepu8_epi32(a));
    }
4565
    #[simd_test(enable = "avx2")]
    const fn test_mm256_cvtepu8_epi64() {
        // Zero-extension: only the low 4 bytes widen to i64 lanes;
        // bytes 4..15 are discarded.
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r = _mm256_setr_epi64x(0, 1, 2, 3);
        assert_eq_m256i(r, _mm256_cvtepu8_epi64(a));
    }
4576
4577    #[simd_test(enable = "avx2")]
4578    const fn test_mm256_extracti128_si256() {
4579        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4580        let r = _mm256_extracti128_si256::<1>(a);
4581        let e = _mm_setr_epi64x(3, 4);
4582        assert_eq_m128i(r, e);
4583    }
4584
4585    #[simd_test(enable = "avx2")]
4586    const fn test_mm256_hadd_epi16() {
4587        let a = _mm256_set1_epi16(2);
4588        let b = _mm256_set1_epi16(4);
4589        let r = _mm256_hadd_epi16(a, b);
4590        let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
4591        assert_eq_m256i(r, e);
4592    }
4593
4594    #[simd_test(enable = "avx2")]
4595    const fn test_mm256_hadd_epi32() {
4596        let a = _mm256_set1_epi32(2);
4597        let b = _mm256_set1_epi32(4);
4598        let r = _mm256_hadd_epi32(a, b);
4599        let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
4600        assert_eq_m256i(r, e);
4601    }
4602
    #[simd_test(enable = "avx2")]
    fn test_mm256_hadds_epi16() {
        // Saturating horizontal add: lanes 0 and 1 sum to 0x7fff + 1,
        // which saturates to i16::MAX instead of wrapping to i16::MIN.
        let a = _mm256_set1_epi16(2);
        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
        let a = _mm256_insert_epi16::<1>(a, 1);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_hadds_epi16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            0x7FFF, 4, 4, 4, 8, 8, 8, 8,
            4, 4, 4, 4, 8, 8, 8, 8,
        );
        assert_eq_m256i(r, e);
    }
4617
4618    #[simd_test(enable = "avx2")]
4619    const fn test_mm256_hsub_epi16() {
4620        let a = _mm256_set1_epi16(2);
4621        let b = _mm256_set1_epi16(4);
4622        let r = _mm256_hsub_epi16(a, b);
4623        let e = _mm256_set1_epi16(0);
4624        assert_eq_m256i(r, e);
4625    }
4626
4627    #[simd_test(enable = "avx2")]
4628    const fn test_mm256_hsub_epi32() {
4629        let a = _mm256_set1_epi32(2);
4630        let b = _mm256_set1_epi32(4);
4631        let r = _mm256_hsub_epi32(a, b);
4632        let e = _mm256_set1_epi32(0);
4633        assert_eq_m256i(r, e);
4634    }
4635
    #[simd_test(enable = "avx2")]
    fn test_mm256_hsubs_epi16() {
        // Saturating horizontal subtract: lane0 - lane1 = 0x7fff - (-1),
        // which saturates to i16::MAX; every other pair subtracts to 0.
        let a = _mm256_set1_epi16(2);
        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
        let a = _mm256_insert_epi16::<1>(a, -1);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_hsubs_epi16(a, b);
        let e = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 0x7FFF);
        assert_eq_m256i(r, e);
    }
4646
4647    #[simd_test(enable = "avx2")]
4648    fn test_mm256_madd_epi16() {
4649        let a = _mm256_set1_epi16(2);
4650        let b = _mm256_set1_epi16(4);
4651        let r = _mm256_madd_epi16(a, b);
4652        let e = _mm256_set1_epi32(16);
4653        assert_eq_m256i(r, e);
4654    }
4655
4656    #[simd_test(enable = "avx2")]
4657    const fn test_mm256_inserti128_si256() {
4658        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4659        let b = _mm_setr_epi64x(7, 8);
4660        let r = _mm256_inserti128_si256::<1>(a, b);
4661        let e = _mm256_setr_epi64x(1, 2, 7, 8);
4662        assert_eq_m256i(r, e);
4663    }
4664
4665    #[simd_test(enable = "avx2")]
4666    fn test_mm256_maddubs_epi16() {
4667        let a = _mm256_set1_epi8(2);
4668        let b = _mm256_set1_epi8(4);
4669        let r = _mm256_maddubs_epi16(a, b);
4670        let e = _mm256_set1_epi16(16);
4671        assert_eq_m256i(r, e);
4672    }
4673
    #[simd_test(enable = "avx2")]
    const fn test_mm_maskload_epi32() {
        // Lanes whose mask element has the sign bit set are loaded from
        // memory; masked-off lanes read as zero.
        let nums = [1, 2, 3, 4];
        let a = &nums as *const i32;
        let mask = _mm_setr_epi32(-1, 0, 0, -1);
        let r = unsafe { _mm_maskload_epi32(a, mask) };
        let e = _mm_setr_epi32(1, 0, 0, 4);
        assert_eq_m128i(r, e);
    }
4683
    #[simd_test(enable = "avx2")]
    const fn test_mm256_maskload_epi32() {
        // Lanes whose mask element has the sign bit set are loaded from
        // memory; masked-off lanes read as zero.
        let nums = [1, 2, 3, 4, 5, 6, 7, 8];
        let a = &nums as *const i32;
        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
        let r = unsafe { _mm256_maskload_epi32(a, mask) };
        let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
        assert_eq_m256i(r, e);
    }
4693
    #[simd_test(enable = "avx2")]
    const fn test_mm_maskload_epi64() {
        // Only the second lane has its mask sign bit set, so only it is
        // loaded; the first lane reads as zero.
        let nums = [1_i64, 2_i64];
        let a = &nums as *const i64;
        let mask = _mm_setr_epi64x(0, -1);
        let r = unsafe { _mm_maskload_epi64(a, mask) };
        let e = _mm_setr_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }
4703
    #[simd_test(enable = "avx2")]
    const fn test_mm256_maskload_epi64() {
        // The middle two lanes have their mask sign bits set and are
        // loaded; the outer lanes read as zero.
        let nums = [1_i64, 2_i64, 3_i64, 4_i64];
        let a = &nums as *const i64;
        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
        let r = unsafe { _mm256_maskload_epi64(a, mask) };
        let e = _mm256_setr_epi64x(0, 2, 3, 0);
        assert_eq_m256i(r, e);
    }
4713
    #[simd_test(enable = "avx2")]
    const fn test_mm_maskstore_epi32() {
        // Only lanes with the mask sign bit set are written back; the
        // other array slots keep their original -1 values.
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let mut arr = [-1, -1, -1, -1];
        let mask = _mm_setr_epi32(-1, 0, 0, -1);
        unsafe {
            _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
        }
        let e = [1, -1, -1, 4];
        assert_eq!(arr, e);
    }
4725
    #[simd_test(enable = "avx2")]
    const fn test_mm256_maskstore_epi32() {
        // Only lanes with the mask sign bit set are written back; the
        // masked-off source lanes and destination slots hold arbitrary
        // filler values to prove they are neither read nor overwritten.
        let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
        let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
        unsafe {
            _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
        }
        let e = [1, -1, -1, 42, -1, 6, 7, -1];
        assert_eq!(arr, e);
    }
4737
    #[simd_test(enable = "avx2")]
    const fn test_mm_maskstore_epi64() {
        // Only the second lane has its mask sign bit set, so only it is
        // written; the first slot keeps its original -1.
        let a = _mm_setr_epi64x(1_i64, 2_i64);
        let mut arr = [-1_i64, -1_i64];
        let mask = _mm_setr_epi64x(0, -1);
        unsafe {
            _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
        }
        let e = [-1, 2];
        assert_eq!(arr, e);
    }
4749
    #[simd_test(enable = "avx2")]
    const fn test_mm256_maskstore_epi64() {
        // The middle two lanes have their mask sign bits set and are
        // written; the outer slots keep their original -1 values.
        let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
        let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
        unsafe {
            _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
        }
        let e = [-1, 2, 3, -1];
        assert_eq!(arr, e);
    }
4761
4762    #[simd_test(enable = "avx2")]
4763    const fn test_mm256_max_epi16() {
4764        let a = _mm256_set1_epi16(2);
4765        let b = _mm256_set1_epi16(4);
4766        let r = _mm256_max_epi16(a, b);
4767        assert_eq_m256i(r, b);
4768    }
4769
4770    #[simd_test(enable = "avx2")]
4771    const fn test_mm256_max_epi32() {
4772        let a = _mm256_set1_epi32(2);
4773        let b = _mm256_set1_epi32(4);
4774        let r = _mm256_max_epi32(a, b);
4775        assert_eq_m256i(r, b);
4776    }
4777
4778    #[simd_test(enable = "avx2")]
4779    const fn test_mm256_max_epi8() {
4780        let a = _mm256_set1_epi8(2);
4781        let b = _mm256_set1_epi8(4);
4782        let r = _mm256_max_epi8(a, b);
4783        assert_eq_m256i(r, b);
4784    }
4785
4786    #[simd_test(enable = "avx2")]
4787    const fn test_mm256_max_epu16() {
4788        let a = _mm256_set1_epi16(2);
4789        let b = _mm256_set1_epi16(4);
4790        let r = _mm256_max_epu16(a, b);
4791        assert_eq_m256i(r, b);
4792    }
4793
4794    #[simd_test(enable = "avx2")]
4795    const fn test_mm256_max_epu32() {
4796        let a = _mm256_set1_epi32(2);
4797        let b = _mm256_set1_epi32(4);
4798        let r = _mm256_max_epu32(a, b);
4799        assert_eq_m256i(r, b);
4800    }
4801
4802    #[simd_test(enable = "avx2")]
4803    const fn test_mm256_max_epu8() {
4804        let a = _mm256_set1_epi8(2);
4805        let b = _mm256_set1_epi8(4);
4806        let r = _mm256_max_epu8(a, b);
4807        assert_eq_m256i(r, b);
4808    }
4809
4810    #[simd_test(enable = "avx2")]
4811    const fn test_mm256_min_epi16() {
4812        let a = _mm256_set1_epi16(2);
4813        let b = _mm256_set1_epi16(4);
4814        let r = _mm256_min_epi16(a, b);
4815        assert_eq_m256i(r, a);
4816    }
4817
4818    #[simd_test(enable = "avx2")]
4819    const fn test_mm256_min_epi32() {
4820        let a = _mm256_set1_epi32(2);
4821        let b = _mm256_set1_epi32(4);
4822        let r = _mm256_min_epi32(a, b);
4823        assert_eq_m256i(r, a);
4824    }
4825
4826    #[simd_test(enable = "avx2")]
4827    const fn test_mm256_min_epi8() {
4828        let a = _mm256_set1_epi8(2);
4829        let b = _mm256_set1_epi8(4);
4830        let r = _mm256_min_epi8(a, b);
4831        assert_eq_m256i(r, a);
4832    }
4833
4834    #[simd_test(enable = "avx2")]
4835    const fn test_mm256_min_epu16() {
4836        let a = _mm256_set1_epi16(2);
4837        let b = _mm256_set1_epi16(4);
4838        let r = _mm256_min_epu16(a, b);
4839        assert_eq_m256i(r, a);
4840    }
4841
4842    #[simd_test(enable = "avx2")]
4843    const fn test_mm256_min_epu32() {
4844        let a = _mm256_set1_epi32(2);
4845        let b = _mm256_set1_epi32(4);
4846        let r = _mm256_min_epu32(a, b);
4847        assert_eq_m256i(r, a);
4848    }
4849
4850    #[simd_test(enable = "avx2")]
4851    const fn test_mm256_min_epu8() {
4852        let a = _mm256_set1_epi8(2);
4853        let b = _mm256_set1_epi8(4);
4854        let r = _mm256_min_epu8(a, b);
4855        assert_eq_m256i(r, a);
4856    }
4857
4858    #[simd_test(enable = "avx2")]
4859    const fn test_mm256_movemask_epi8() {
4860        let a = _mm256_set1_epi8(-1);
4861        let r = _mm256_movemask_epi8(a);
4862        let e = -1;
4863        assert_eq!(r, e);
4864    }
4865
    #[simd_test(enable = "avx2")]
    fn test_mm256_mpsadbw_epu8() {
        // Sum of absolute differences over 4-byte groups selected by
        // IMM8 = 0: |2 - 4| * 4 = 8 in every 16-bit result lane.
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_mpsadbw_epu8::<0>(a, b);
        let e = _mm256_set1_epi16(8);
        assert_eq_m256i(r, e);
    }
4874
4875    #[simd_test(enable = "avx2")]
4876    const fn test_mm256_mul_epi32() {
4877        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4878        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4879        let r = _mm256_mul_epi32(a, b);
4880        let e = _mm256_setr_epi64x(0, 0, 10, 14);
4881        assert_eq_m256i(r, e);
4882    }
4883
4884    #[simd_test(enable = "avx2")]
4885    const fn test_mm256_mul_epu32() {
4886        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4887        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4888        let r = _mm256_mul_epu32(a, b);
4889        let e = _mm256_setr_epi64x(0, 0, 10, 14);
4890        assert_eq_m256i(r, e);
4891    }
4892
4893    #[simd_test(enable = "avx2")]
4894    const fn test_mm256_mulhi_epi16() {
4895        let a = _mm256_set1_epi16(6535);
4896        let b = _mm256_set1_epi16(6535);
4897        let r = _mm256_mulhi_epi16(a, b);
4898        let e = _mm256_set1_epi16(651);
4899        assert_eq_m256i(r, e);
4900    }
4901
4902    #[simd_test(enable = "avx2")]
4903    const fn test_mm256_mulhi_epu16() {
4904        let a = _mm256_set1_epi16(6535);
4905        let b = _mm256_set1_epi16(6535);
4906        let r = _mm256_mulhi_epu16(a, b);
4907        let e = _mm256_set1_epi16(651);
4908        assert_eq_m256i(r, e);
4909    }
4910
4911    #[simd_test(enable = "avx2")]
4912    const fn test_mm256_mullo_epi16() {
4913        let a = _mm256_set1_epi16(2);
4914        let b = _mm256_set1_epi16(4);
4915        let r = _mm256_mullo_epi16(a, b);
4916        let e = _mm256_set1_epi16(8);
4917        assert_eq_m256i(r, e);
4918    }
4919
4920    #[simd_test(enable = "avx2")]
4921    const fn test_mm256_mullo_epi32() {
4922        let a = _mm256_set1_epi32(2);
4923        let b = _mm256_set1_epi32(4);
4924        let r = _mm256_mullo_epi32(a, b);
4925        let e = _mm256_set1_epi32(8);
4926        assert_eq_m256i(r, e);
4927    }
4928
4929    #[simd_test(enable = "avx2")]
4930    fn test_mm256_mulhrs_epi16() {
4931        let a = _mm256_set1_epi16(2);
4932        let b = _mm256_set1_epi16(4);
4933        let r = _mm256_mullo_epi16(a, b);
4934        let e = _mm256_set1_epi16(8);
4935        assert_eq_m256i(r, e);
4936    }
4937
4938    #[simd_test(enable = "avx2")]
4939    const fn test_mm256_or_si256() {
4940        let a = _mm256_set1_epi8(-1);
4941        let b = _mm256_set1_epi8(0);
4942        let r = _mm256_or_si256(a, b);
4943        assert_eq_m256i(r, a);
4944    }
4945
    #[simd_test(enable = "avx2")]
    fn test_mm256_packs_epi16() {
        // Narrows i16 lanes to i8 with signed saturation; within each
        // 128-bit lane the 8 values from `a` precede the 8 from `b`.
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_packs_epi16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            2, 2, 2, 2, 2, 2, 2, 2,
            4, 4, 4, 4, 4, 4, 4, 4,
            2, 2, 2, 2, 2, 2, 2, 2,
            4, 4, 4, 4, 4, 4, 4, 4,
        );

        assert_eq_m256i(r, e);
    }
4961
4962    #[simd_test(enable = "avx2")]
4963    fn test_mm256_packs_epi32() {
4964        let a = _mm256_set1_epi32(2);
4965        let b = _mm256_set1_epi32(4);
4966        let r = _mm256_packs_epi32(a, b);
4967        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
4968
4969        assert_eq_m256i(r, e);
4970    }
4971
    #[simd_test(enable = "avx2")]
    fn test_mm256_packus_epi16() {
        // Narrows i16 lanes to u8 with unsigned saturation; within each
        // 128-bit lane the 8 values from `a` precede the 8 from `b`.
        let a = _mm256_set1_epi16(2);
        let b = _mm256_set1_epi16(4);
        let r = _mm256_packus_epi16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            2, 2, 2, 2, 2, 2, 2, 2,
            4, 4, 4, 4, 4, 4, 4, 4,
            2, 2, 2, 2, 2, 2, 2, 2,
            4, 4, 4, 4, 4, 4, 4, 4,
        );

        assert_eq_m256i(r, e);
    }
4987
4988    #[simd_test(enable = "avx2")]
4989    fn test_mm256_packus_epi32() {
4990        let a = _mm256_set1_epi32(2);
4991        let b = _mm256_set1_epi32(4);
4992        let r = _mm256_packus_epi32(a, b);
4993        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
4994
4995        assert_eq_m256i(r, e);
4996    }
4997
4998    #[simd_test(enable = "avx2")]
4999    fn test_mm256_sad_epu8() {
5000        let a = _mm256_set1_epi8(2);
5001        let b = _mm256_set1_epi8(4);
5002        let r = _mm256_sad_epu8(a, b);
5003        let e = _mm256_set1_epi64x(16);
5004        assert_eq_m256i(r, e);
5005    }
5006
    #[simd_test(enable = "avx2")]
    const fn test_mm256_shufflehi_epi16() {
        // IMM 0b00_01_01_11 reorders the HIGH four i16s of each 128-bit
        // lane as source indices (3, 1, 1, 0); the low four pass through.
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            0, 1, 2, 3, 11, 22, 33, 44,
            4, 5, 6, 7, 55, 66, 77, 88,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            0, 1, 2, 3, 44, 22, 22, 11,
            4, 5, 6, 7, 88, 66, 66, 55,
        );
        let r = _mm256_shufflehi_epi16::<0b00_01_01_11>(a);
        assert_eq_m256i(r, e);
    }
5022
    #[simd_test(enable = "avx2")]
    const fn test_mm256_shufflelo_epi16() {
        // IMM 0b00_01_01_11 reorders the LOW four i16s of each 128-bit
        // lane as source indices (3, 1, 1, 0); the high four pass through.
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            11, 22, 33, 44, 0, 1, 2, 3,
            55, 66, 77, 88, 4, 5, 6, 7,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            44, 22, 22, 11, 0, 1, 2, 3,
            88, 66, 66, 55, 4, 5, 6, 7,
        );
        let r = _mm256_shufflelo_epi16::<0b00_01_01_11>(a);
        assert_eq_m256i(r, e);
    }
5038
5039    #[simd_test(enable = "avx2")]
5040    fn test_mm256_sign_epi16() {
5041        let a = _mm256_set1_epi16(2);
5042        let b = _mm256_set1_epi16(-1);
5043        let r = _mm256_sign_epi16(a, b);
5044        let e = _mm256_set1_epi16(-2);
5045        assert_eq_m256i(r, e);
5046    }
5047
5048    #[simd_test(enable = "avx2")]
5049    fn test_mm256_sign_epi32() {
5050        let a = _mm256_set1_epi32(2);
5051        let b = _mm256_set1_epi32(-1);
5052        let r = _mm256_sign_epi32(a, b);
5053        let e = _mm256_set1_epi32(-2);
5054        assert_eq_m256i(r, e);
5055    }
5056
5057    #[simd_test(enable = "avx2")]
5058    fn test_mm256_sign_epi8() {
5059        let a = _mm256_set1_epi8(2);
5060        let b = _mm256_set1_epi8(-1);
5061        let r = _mm256_sign_epi8(a, b);
5062        let e = _mm256_set1_epi8(-2);
5063        assert_eq_m256i(r, e);
5064    }
5065
5066    #[simd_test(enable = "avx2")]
5067    fn test_mm256_sll_epi16() {
5068        let a = _mm256_set1_epi16(0xFF);
5069        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
5070        let r = _mm256_sll_epi16(a, b);
5071        assert_eq_m256i(r, _mm256_set1_epi16(0xFF0));
5072    }
5073
5074    #[simd_test(enable = "avx2")]
5075    fn test_mm256_sll_epi32() {
5076        let a = _mm256_set1_epi32(0xFFFF);
5077        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
5078        let r = _mm256_sll_epi32(a, b);
5079        assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0));
5080    }
5081
5082    #[simd_test(enable = "avx2")]
5083    fn test_mm256_sll_epi64() {
5084        let a = _mm256_set1_epi64x(0xFFFFFFFF);
5085        let b = _mm_insert_epi64::<0>(_mm_set1_epi64x(0), 4);
5086        let r = _mm256_sll_epi64(a, b);
5087        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0));
5088    }
5089
5090    #[simd_test(enable = "avx2")]
5091    const fn test_mm256_slli_epi16() {
5092        assert_eq_m256i(
5093            _mm256_slli_epi16::<4>(_mm256_set1_epi16(0xFF)),
5094            _mm256_set1_epi16(0xFF0),
5095        );
5096    }
5097
5098    #[simd_test(enable = "avx2")]
5099    const fn test_mm256_slli_epi32() {
5100        assert_eq_m256i(
5101            _mm256_slli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
5102            _mm256_set1_epi32(0xFFFF0),
5103        );
5104    }
5105
5106    #[simd_test(enable = "avx2")]
5107    const fn test_mm256_slli_epi64() {
5108        assert_eq_m256i(
5109            _mm256_slli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
5110            _mm256_set1_epi64x(0xFFFFFFFF0),
5111        );
5112    }
5113
5114    #[simd_test(enable = "avx2")]
5115    const fn test_mm256_slli_si256() {
5116        let a = _mm256_set1_epi64x(0xFFFFFFFF);
5117        let r = _mm256_slli_si256::<3>(a);
5118        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
5119    }
5120
5121    #[simd_test(enable = "avx2")]
5122    const fn test_mm_sllv_epi32() {
5123        let a = _mm_set1_epi32(2);
5124        let b = _mm_set1_epi32(1);
5125        let r = _mm_sllv_epi32(a, b);
5126        let e = _mm_set1_epi32(4);
5127        assert_eq_m128i(r, e);
5128    }
5129
5130    #[simd_test(enable = "avx2")]
5131    const fn test_mm256_sllv_epi32() {
5132        let a = _mm256_set1_epi32(2);
5133        let b = _mm256_set1_epi32(1);
5134        let r = _mm256_sllv_epi32(a, b);
5135        let e = _mm256_set1_epi32(4);
5136        assert_eq_m256i(r, e);
5137    }
5138
5139    #[simd_test(enable = "avx2")]
5140    const fn test_mm_sllv_epi64() {
5141        let a = _mm_set1_epi64x(2);
5142        let b = _mm_set1_epi64x(1);
5143        let r = _mm_sllv_epi64(a, b);
5144        let e = _mm_set1_epi64x(4);
5145        assert_eq_m128i(r, e);
5146    }
5147
5148    #[simd_test(enable = "avx2")]
5149    const fn test_mm256_sllv_epi64() {
5150        let a = _mm256_set1_epi64x(2);
5151        let b = _mm256_set1_epi64x(1);
5152        let r = _mm256_sllv_epi64(a, b);
5153        let e = _mm256_set1_epi64x(4);
5154        assert_eq_m256i(r, e);
5155    }
5156
5157    #[simd_test(enable = "avx2")]
5158    fn test_mm256_sra_epi16() {
5159        let a = _mm256_set1_epi16(-1);
5160        let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
5161        let r = _mm256_sra_epi16(a, b);
5162        assert_eq_m256i(r, _mm256_set1_epi16(-1));
5163    }
5164
5165    #[simd_test(enable = "avx2")]
5166    fn test_mm256_sra_epi32() {
5167        let a = _mm256_set1_epi32(-1);
5168        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 1);
5169        let r = _mm256_sra_epi32(a, b);
5170        assert_eq_m256i(r, _mm256_set1_epi32(-1));
5171    }
5172
5173    #[simd_test(enable = "avx2")]
5174    const fn test_mm256_srai_epi16() {
5175        assert_eq_m256i(
5176            _mm256_srai_epi16::<1>(_mm256_set1_epi16(-1)),
5177            _mm256_set1_epi16(-1),
5178        );
5179    }
5180
5181    #[simd_test(enable = "avx2")]
5182    const fn test_mm256_srai_epi32() {
5183        assert_eq_m256i(
5184            _mm256_srai_epi32::<1>(_mm256_set1_epi32(-1)),
5185            _mm256_set1_epi32(-1),
5186        );
5187    }
5188
5189    #[simd_test(enable = "avx2")]
5190    const fn test_mm_srav_epi32() {
5191        let a = _mm_set1_epi32(4);
5192        let count = _mm_set1_epi32(1);
5193        let r = _mm_srav_epi32(a, count);
5194        let e = _mm_set1_epi32(2);
5195        assert_eq_m128i(r, e);
5196    }
5197
5198    #[simd_test(enable = "avx2")]
5199    const fn test_mm256_srav_epi32() {
5200        let a = _mm256_set1_epi32(4);
5201        let count = _mm256_set1_epi32(1);
5202        let r = _mm256_srav_epi32(a, count);
5203        let e = _mm256_set1_epi32(2);
5204        assert_eq_m256i(r, e);
5205    }
5206
    #[simd_test(enable = "avx2")]
    const fn test_mm256_srli_si256() {
        // Each 128-bit lane is independently byte-shifted right by 3;
        // zeros fill the vacated high bytes of each lane.
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm256_srli_si256::<3>(a);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            4, 5, 6, 7, 8, 9, 10, 11,
            12, 13, 14, 15, 16, 0, 0, 0,
            20, 21, 22, 23, 24, 25, 26, 27,
            28, 29, 30, 31, 32, 0, 0, 0,
        );
        assert_eq_m256i(r, e);
    }
5226
5227    #[simd_test(enable = "avx2")]
5228    fn test_mm256_srl_epi16() {
5229        let a = _mm256_set1_epi16(0xFF);
5230        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
5231        let r = _mm256_srl_epi16(a, b);
5232        assert_eq_m256i(r, _mm256_set1_epi16(0xF));
5233    }
5234
5235    #[simd_test(enable = "avx2")]
5236    fn test_mm256_srl_epi32() {
5237        let a = _mm256_set1_epi32(0xFFFF);
5238        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
5239        let r = _mm256_srl_epi32(a, b);
5240        assert_eq_m256i(r, _mm256_set1_epi32(0xFFF));
5241    }
5242
5243    #[simd_test(enable = "avx2")]
5244    fn test_mm256_srl_epi64() {
5245        let a = _mm256_set1_epi64x(0xFFFFFFFF);
5246        let b = _mm_setr_epi64x(4, 0);
5247        let r = _mm256_srl_epi64(a, b);
5248        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF));
5249    }
5250
5251    #[simd_test(enable = "avx2")]
5252    const fn test_mm256_srli_epi16() {
5253        assert_eq_m256i(
5254            _mm256_srli_epi16::<4>(_mm256_set1_epi16(0xFF)),
5255            _mm256_set1_epi16(0xF),
5256        );
5257    }
5258
5259    #[simd_test(enable = "avx2")]
5260    const fn test_mm256_srli_epi32() {
5261        assert_eq_m256i(
5262            _mm256_srli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
5263            _mm256_set1_epi32(0xFFF),
5264        );
5265    }
5266
5267    #[simd_test(enable = "avx2")]
5268    const fn test_mm256_srli_epi64() {
5269        assert_eq_m256i(
5270            _mm256_srli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
5271            _mm256_set1_epi64x(0xFFFFFFF),
5272        );
5273    }
5274
5275    #[simd_test(enable = "avx2")]
5276    const fn test_mm_srlv_epi32() {
5277        let a = _mm_set1_epi32(2);
5278        let count = _mm_set1_epi32(1);
5279        let r = _mm_srlv_epi32(a, count);
5280        let e = _mm_set1_epi32(1);
5281        assert_eq_m128i(r, e);
5282    }
5283
5284    #[simd_test(enable = "avx2")]
5285    const fn test_mm256_srlv_epi32() {
5286        let a = _mm256_set1_epi32(2);
5287        let count = _mm256_set1_epi32(1);
5288        let r = _mm256_srlv_epi32(a, count);
5289        let e = _mm256_set1_epi32(1);
5290        assert_eq_m256i(r, e);
5291    }
5292
5293    #[simd_test(enable = "avx2")]
5294    const fn test_mm_srlv_epi64() {
5295        let a = _mm_set1_epi64x(2);
5296        let count = _mm_set1_epi64x(1);
5297        let r = _mm_srlv_epi64(a, count);
5298        let e = _mm_set1_epi64x(1);
5299        assert_eq_m128i(r, e);
5300    }
5301
5302    #[simd_test(enable = "avx2")]
5303    const fn test_mm256_srlv_epi64() {
5304        let a = _mm256_set1_epi64x(2);
5305        let count = _mm256_set1_epi64x(1);
5306        let r = _mm256_srlv_epi64(a, count);
5307        let e = _mm256_set1_epi64x(1);
5308        assert_eq_m256i(r, e);
5309    }
5310
    #[simd_test(enable = "avx2")]
    fn test_mm256_stream_load_si256() {
        // Non-temporal load must round-trip the stored value. The source
        // pointer must be 32-byte aligned; taking the address of an
        // `__m256i` local satisfies that via the type's own alignment.
        let a = _mm256_set_epi64x(5, 6, 7, 8);
        let r = unsafe { _mm256_stream_load_si256(core::ptr::addr_of!(a) as *const _) };
        assert_eq_m256i(a, r);
    }
5317
5318    #[simd_test(enable = "avx2")]
5319    const fn test_mm256_sub_epi16() {
5320        let a = _mm256_set1_epi16(4);
5321        let b = _mm256_set1_epi16(2);
5322        let r = _mm256_sub_epi16(a, b);
5323        assert_eq_m256i(r, b);
5324    }
5325
5326    #[simd_test(enable = "avx2")]
5327    const fn test_mm256_sub_epi32() {
5328        let a = _mm256_set1_epi32(4);
5329        let b = _mm256_set1_epi32(2);
5330        let r = _mm256_sub_epi32(a, b);
5331        assert_eq_m256i(r, b);
5332    }
5333
5334    #[simd_test(enable = "avx2")]
5335    const fn test_mm256_sub_epi64() {
5336        let a = _mm256_set1_epi64x(4);
5337        let b = _mm256_set1_epi64x(2);
5338        let r = _mm256_sub_epi64(a, b);
5339        assert_eq_m256i(r, b);
5340    }
5341
5342    #[simd_test(enable = "avx2")]
5343    const fn test_mm256_sub_epi8() {
5344        let a = _mm256_set1_epi8(4);
5345        let b = _mm256_set1_epi8(2);
5346        let r = _mm256_sub_epi8(a, b);
5347        assert_eq_m256i(r, b);
5348    }
5349
5350    #[simd_test(enable = "avx2")]
5351    const fn test_mm256_subs_epi16() {
5352        let a = _mm256_set1_epi16(4);
5353        let b = _mm256_set1_epi16(2);
5354        let r = _mm256_subs_epi16(a, b);
5355        assert_eq_m256i(r, b);
5356    }
5357
5358    #[simd_test(enable = "avx2")]
5359    const fn test_mm256_subs_epi8() {
5360        let a = _mm256_set1_epi8(4);
5361        let b = _mm256_set1_epi8(2);
5362        let r = _mm256_subs_epi8(a, b);
5363        assert_eq_m256i(r, b);
5364    }
5365
5366    #[simd_test(enable = "avx2")]
5367    const fn test_mm256_subs_epu16() {
5368        let a = _mm256_set1_epi16(4);
5369        let b = _mm256_set1_epi16(2);
5370        let r = _mm256_subs_epu16(a, b);
5371        assert_eq_m256i(r, b);
5372    }
5373
5374    #[simd_test(enable = "avx2")]
5375    const fn test_mm256_subs_epu8() {
5376        let a = _mm256_set1_epi8(4);
5377        let b = _mm256_set1_epi8(2);
5378        let r = _mm256_subs_epu8(a, b);
5379        assert_eq_m256i(r, b);
5380    }
5381
5382    #[simd_test(enable = "avx2")]
5383    const fn test_mm256_xor_si256() {
5384        let a = _mm256_set1_epi8(5);
5385        let b = _mm256_set1_epi8(3);
5386        let r = _mm256_xor_si256(a, b);
5387        assert_eq_m256i(r, _mm256_set1_epi8(6));
5388    }
5389
    #[simd_test(enable = "avx2")]
    const fn test_mm256_alignr_epi8() {
        // VPALIGNR concatenates the matching 128-bit lanes of (a, b) and
        // byte-shifts the 32-byte intermediate right by IMM, keeping the
        // low 16 bytes per lane. Edge cases exercised below:
        //   IMM >= 32 -> all zeros; 16 < IMM < 32 -> shifts within `a`
        //   with zero fill; IMM == 0 -> `b`; IMM == 16 -> `a`.
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            -1, -2, -3, -4, -5, -6, -7, -8,
            -9, -10, -11, -12, -13, -14, -15, -16,
            -17, -18, -19, -20, -21, -22, -23, -24,
            -25, -26, -27, -28, -29, -30, -31, -32,
        );
        let r = _mm256_alignr_epi8::<33>(a, b);
        assert_eq_m256i(r, _mm256_set1_epi8(0));

        let r = _mm256_alignr_epi8::<17>(a, b);
        #[rustfmt::skip]
        let expected = _mm256_setr_epi8(
            2, 3, 4, 5, 6, 7, 8, 9,
            10, 11, 12, 13, 14, 15, 16, 0,
            18, 19, 20, 21, 22, 23, 24, 25,
            26, 27, 28, 29, 30, 31, 32, 0,
        );
        assert_eq_m256i(r, expected);

        let r = _mm256_alignr_epi8::<4>(a, b);
        #[rustfmt::skip]
        let expected = _mm256_setr_epi8(
            -5, -6, -7, -8, -9, -10, -11, -12,
            -13, -14, -15, -16, 1, 2, 3, 4,
            -21, -22, -23, -24, -25, -26, -27, -28,
            -29, -30, -31, -32, 17, 18, 19, 20,
        );
        assert_eq_m256i(r, expected);

        let r = _mm256_alignr_epi8::<15>(a, b);
        #[rustfmt::skip]
        let expected = _mm256_setr_epi8(
            -16, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            -32, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31,
        );
        assert_eq_m256i(r, expected);

        let r = _mm256_alignr_epi8::<0>(a, b);
        assert_eq_m256i(r, b);

        let r = _mm256_alignr_epi8::<16>(a, b);
        assert_eq_m256i(r, a);
    }
5445
    #[simd_test(enable = "avx2")]
    fn test_mm256_shuffle_epi8() {
        // Each byte of `b` indexes into the SAME 128-bit lane of `a`
        // (low 4 bits select the byte); a set high bit (0x80) forces the
        // result byte to zero, as in the second output column.
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let b = _mm256_setr_epi8(
            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
            12, 5, 5, 10, 4, 1, 8, 0,
            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
            12, 5, 5, 10, 4, 1, 8, 0,
        );
        #[rustfmt::skip]
        let expected = _mm256_setr_epi8(
            5, 0, 5, 4, 9, 13, 7, 4,
            13, 6, 6, 11, 5, 2, 9, 1,
            21, 0, 21, 20, 25, 29, 23, 20,
            29, 22, 22, 27, 21, 18, 25, 17,
        );
        let r = _mm256_shuffle_epi8(a, b);
        assert_eq_m256i(r, expected);
    }
5472
5473    #[simd_test(enable = "avx2")]
5474    fn test_mm256_permutevar8x32_epi32() {
5475        let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
5476        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5477        let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
5478        let r = _mm256_permutevar8x32_epi32(a, b);
5479        assert_eq_m256i(r, expected);
5480    }
5481
5482    #[simd_test(enable = "avx2")]
5483    const fn test_mm256_permute4x64_epi64() {
5484        let a = _mm256_setr_epi64x(100, 200, 300, 400);
5485        let expected = _mm256_setr_epi64x(400, 100, 200, 100);
5486        let r = _mm256_permute4x64_epi64::<0b00010011>(a);
5487        assert_eq_m256i(r, expected);
5488    }
5489
5490    #[simd_test(enable = "avx2")]
5491    const fn test_mm256_permute2x128_si256() {
5492        let a = _mm256_setr_epi64x(100, 200, 500, 600);
5493        let b = _mm256_setr_epi64x(300, 400, 700, 800);
5494        let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
5495        let e = _mm256_setr_epi64x(700, 800, 500, 600);
5496        assert_eq_m256i(r, e);
5497    }
5498
5499    #[simd_test(enable = "avx2")]
5500    const fn test_mm256_permute4x64_pd() {
5501        let a = _mm256_setr_pd(1., 2., 3., 4.);
5502        let r = _mm256_permute4x64_pd::<0b00_01_00_11>(a);
5503        let e = _mm256_setr_pd(4., 1., 2., 1.);
5504        assert_eq_m256d(r, e);
5505    }
5506
5507    #[simd_test(enable = "avx2")]
5508    fn test_mm256_permutevar8x32_ps() {
5509        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5510        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5511        let r = _mm256_permutevar8x32_ps(a, b);
5512        let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
5513        assert_eq_m256(r, e);
5514    }
5515
5516    #[simd_test(enable = "avx2")]
5517    fn test_mm_i32gather_epi32() {
5518        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5519        // A multiplier of 4 is word-addressing
5520        let r = unsafe { _mm_i32gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)) };
5521        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5522    }
5523
5524    #[simd_test(enable = "avx2")]
5525    fn test_mm_mask_i32gather_epi32() {
5526        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5527        // A multiplier of 4 is word-addressing
5528        let r = unsafe {
5529            _mm_mask_i32gather_epi32::<4>(
5530                _mm_set1_epi32(256),
5531                arr.as_ptr(),
5532                _mm_setr_epi32(0, 16, 64, 96),
5533                _mm_setr_epi32(-1, -1, -1, 0),
5534            )
5535        };
5536        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5537    }
5538
5539    #[simd_test(enable = "avx2")]
5540    fn test_mm256_i32gather_epi32() {
5541        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5542        // A multiplier of 4 is word-addressing
5543        let r = unsafe {
5544            _mm256_i32gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4))
5545        };
5546        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5547    }
5548
5549    #[simd_test(enable = "avx2")]
5550    fn test_mm256_mask_i32gather_epi32() {
5551        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5552        // A multiplier of 4 is word-addressing
5553        let r = unsafe {
5554            _mm256_mask_i32gather_epi32::<4>(
5555                _mm256_set1_epi32(256),
5556                arr.as_ptr(),
5557                _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5558                _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
5559            )
5560        };
5561        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
5562    }
5563
5564    #[simd_test(enable = "avx2")]
5565    fn test_mm_i32gather_ps() {
5566        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5567        // A multiplier of 4 is word-addressing for f32s
5568        let r = unsafe { _mm_i32gather_ps::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)) };
5569        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5570    }
5571
5572    #[simd_test(enable = "avx2")]
5573    fn test_mm_mask_i32gather_ps() {
5574        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5575        // A multiplier of 4 is word-addressing for f32s
5576        let r = unsafe {
5577            _mm_mask_i32gather_ps::<4>(
5578                _mm_set1_ps(256.0),
5579                arr.as_ptr(),
5580                _mm_setr_epi32(0, 16, 64, 96),
5581                _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5582            )
5583        };
5584        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5585    }
5586
5587    #[simd_test(enable = "avx2")]
5588    fn test_mm256_i32gather_ps() {
5589        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5590        // A multiplier of 4 is word-addressing for f32s
5591        let r = unsafe {
5592            _mm256_i32gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4))
5593        };
5594        assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
5595    }
5596
5597    #[simd_test(enable = "avx2")]
5598    fn test_mm256_mask_i32gather_ps() {
5599        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5600        // A multiplier of 4 is word-addressing for f32s
5601        let r = unsafe {
5602            _mm256_mask_i32gather_ps::<4>(
5603                _mm256_set1_ps(256.0),
5604                arr.as_ptr(),
5605                _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5606                _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
5607            )
5608        };
5609        assert_eq_m256(
5610            r,
5611            _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
5612        );
5613    }
5614
5615    #[simd_test(enable = "avx2")]
5616    fn test_mm_i32gather_epi64() {
5617        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5618        // A multiplier of 8 is word-addressing for i64s
5619        let r = unsafe { _mm_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0)) };
5620        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5621    }
5622
5623    #[simd_test(enable = "avx2")]
5624    fn test_mm_mask_i32gather_epi64() {
5625        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5626        // A multiplier of 8 is word-addressing for i64s
5627        let r = unsafe {
5628            _mm_mask_i32gather_epi64::<8>(
5629                _mm_set1_epi64x(256),
5630                arr.as_ptr(),
5631                _mm_setr_epi32(16, 16, 16, 16),
5632                _mm_setr_epi64x(-1, 0),
5633            )
5634        };
5635        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
5636    }
5637
5638    #[simd_test(enable = "avx2")]
5639    fn test_mm256_i32gather_epi64() {
5640        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5641        // A multiplier of 8 is word-addressing for i64s
5642        let r = unsafe { _mm256_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)) };
5643        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
5644    }
5645
5646    #[simd_test(enable = "avx2")]
5647    fn test_mm256_mask_i32gather_epi64() {
5648        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5649        // A multiplier of 8 is word-addressing for i64s
5650        let r = unsafe {
5651            _mm256_mask_i32gather_epi64::<8>(
5652                _mm256_set1_epi64x(256),
5653                arr.as_ptr(),
5654                _mm_setr_epi32(0, 16, 64, 96),
5655                _mm256_setr_epi64x(-1, -1, -1, 0),
5656            )
5657        };
5658        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
5659    }
5660
5661    #[simd_test(enable = "avx2")]
5662    fn test_mm_i32gather_pd() {
5663        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5664        // A multiplier of 8 is word-addressing for f64s
5665        let r = unsafe { _mm_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0)) };
5666        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
5667    }
5668
5669    #[simd_test(enable = "avx2")]
5670    fn test_mm_mask_i32gather_pd() {
5671        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5672        // A multiplier of 8 is word-addressing for f64s
5673        let r = unsafe {
5674            _mm_mask_i32gather_pd::<8>(
5675                _mm_set1_pd(256.0),
5676                arr.as_ptr(),
5677                _mm_setr_epi32(16, 16, 16, 16),
5678                _mm_setr_pd(-1.0, 0.0),
5679            )
5680        };
5681        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
5682    }
5683
5684    #[simd_test(enable = "avx2")]
5685    fn test_mm256_i32gather_pd() {
5686        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5687        // A multiplier of 8 is word-addressing for f64s
5688        let r = unsafe { _mm256_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)) };
5689        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
5690    }
5691
5692    #[simd_test(enable = "avx2")]
5693    fn test_mm256_mask_i32gather_pd() {
5694        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5695        // A multiplier of 8 is word-addressing for f64s
5696        let r = unsafe {
5697            _mm256_mask_i32gather_pd::<8>(
5698                _mm256_set1_pd(256.0),
5699                arr.as_ptr(),
5700                _mm_setr_epi32(0, 16, 64, 96),
5701                _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
5702            )
5703        };
5704        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
5705    }
5706
5707    #[simd_test(enable = "avx2")]
5708    fn test_mm_i64gather_epi32() {
5709        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5710        // A multiplier of 4 is word-addressing
5711        let r = unsafe { _mm_i64gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16)) };
5712        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
5713    }
5714
5715    #[simd_test(enable = "avx2")]
5716    fn test_mm_mask_i64gather_epi32() {
5717        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5718        // A multiplier of 4 is word-addressing
5719        let r = unsafe {
5720            _mm_mask_i64gather_epi32::<4>(
5721                _mm_set1_epi32(256),
5722                arr.as_ptr(),
5723                _mm_setr_epi64x(0, 16),
5724                _mm_setr_epi32(-1, 0, -1, 0),
5725            )
5726        };
5727        assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
5728    }
5729
5730    #[simd_test(enable = "avx2")]
5731    fn test_mm256_i64gather_epi32() {
5732        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5733        // A multiplier of 4 is word-addressing
5734        let r =
5735            unsafe { _mm256_i64gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)) };
5736        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5737    }
5738
5739    #[simd_test(enable = "avx2")]
5740    fn test_mm256_mask_i64gather_epi32() {
5741        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5742        // A multiplier of 4 is word-addressing
5743        let r = unsafe {
5744            _mm256_mask_i64gather_epi32::<4>(
5745                _mm_set1_epi32(256),
5746                arr.as_ptr(),
5747                _mm256_setr_epi64x(0, 16, 64, 96),
5748                _mm_setr_epi32(-1, -1, -1, 0),
5749            )
5750        };
5751        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5752    }
5753
5754    #[simd_test(enable = "avx2")]
5755    fn test_mm_i64gather_ps() {
5756        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5757        // A multiplier of 4 is word-addressing for f32s
5758        let r = unsafe { _mm_i64gather_ps::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16)) };
5759        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
5760    }
5761
5762    #[simd_test(enable = "avx2")]
5763    fn test_mm_mask_i64gather_ps() {
5764        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5765        // A multiplier of 4 is word-addressing for f32s
5766        let r = unsafe {
5767            _mm_mask_i64gather_ps::<4>(
5768                _mm_set1_ps(256.0),
5769                arr.as_ptr(),
5770                _mm_setr_epi64x(0, 16),
5771                _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
5772            )
5773        };
5774        assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
5775    }
5776
5777    #[simd_test(enable = "avx2")]
5778    fn test_mm256_i64gather_ps() {
5779        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5780        // A multiplier of 4 is word-addressing for f32s
5781        let r =
5782            unsafe { _mm256_i64gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)) };
5783        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5784    }
5785
5786    #[simd_test(enable = "avx2")]
5787    fn test_mm256_mask_i64gather_ps() {
5788        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5789        // A multiplier of 4 is word-addressing for f32s
5790        let r = unsafe {
5791            _mm256_mask_i64gather_ps::<4>(
5792                _mm_set1_ps(256.0),
5793                arr.as_ptr(),
5794                _mm256_setr_epi64x(0, 16, 64, 96),
5795                _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5796            )
5797        };
5798        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5799    }
5800
5801    #[simd_test(enable = "avx2")]
5802    fn test_mm_i64gather_epi64() {
5803        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5804        // A multiplier of 8 is word-addressing for i64s
5805        let r = unsafe { _mm_i64gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16)) };
5806        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5807    }
5808
5809    #[simd_test(enable = "avx2")]
5810    fn test_mm_mask_i64gather_epi64() {
5811        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5812        // A multiplier of 8 is word-addressing for i64s
5813        let r = unsafe {
5814            _mm_mask_i64gather_epi64::<8>(
5815                _mm_set1_epi64x(256),
5816                arr.as_ptr(),
5817                _mm_setr_epi64x(16, 16),
5818                _mm_setr_epi64x(-1, 0),
5819            )
5820        };
5821        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
5822    }
5823
5824    #[simd_test(enable = "avx2")]
5825    fn test_mm256_i64gather_epi64() {
5826        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5827        // A multiplier of 8 is word-addressing for i64s
5828        let r =
5829            unsafe { _mm256_i64gather_epi64::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)) };
5830        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
5831    }
5832
5833    #[simd_test(enable = "avx2")]
5834    fn test_mm256_mask_i64gather_epi64() {
5835        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5836        // A multiplier of 8 is word-addressing for i64s
5837        let r = unsafe {
5838            _mm256_mask_i64gather_epi64::<8>(
5839                _mm256_set1_epi64x(256),
5840                arr.as_ptr(),
5841                _mm256_setr_epi64x(0, 16, 64, 96),
5842                _mm256_setr_epi64x(-1, -1, -1, 0),
5843            )
5844        };
5845        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
5846    }
5847
5848    #[simd_test(enable = "avx2")]
5849    fn test_mm_i64gather_pd() {
5850        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5851        // A multiplier of 8 is word-addressing for f64s
5852        let r = unsafe { _mm_i64gather_pd::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16)) };
5853        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
5854    }
5855
5856    #[simd_test(enable = "avx2")]
5857    fn test_mm_mask_i64gather_pd() {
5858        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5859        // A multiplier of 8 is word-addressing for f64s
5860        let r = unsafe {
5861            _mm_mask_i64gather_pd::<8>(
5862                _mm_set1_pd(256.0),
5863                arr.as_ptr(),
5864                _mm_setr_epi64x(16, 16),
5865                _mm_setr_pd(-1.0, 0.0),
5866            )
5867        };
5868        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
5869    }
5870
5871    #[simd_test(enable = "avx2")]
5872    fn test_mm256_i64gather_pd() {
5873        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5874        // A multiplier of 8 is word-addressing for f64s
5875        let r =
5876            unsafe { _mm256_i64gather_pd::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)) };
5877        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
5878    }
5879
5880    #[simd_test(enable = "avx2")]
5881    fn test_mm256_mask_i64gather_pd() {
5882        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5883        // A multiplier of 8 is word-addressing for f64s
5884        let r = unsafe {
5885            _mm256_mask_i64gather_pd::<8>(
5886                _mm256_set1_pd(256.0),
5887                arr.as_ptr(),
5888                _mm256_setr_epi64x(0, 16, 64, 96),
5889                _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
5890            )
5891        };
5892        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
5893    }
5894
5895    #[simd_test(enable = "avx2")]
5896    const fn test_mm256_extract_epi8() {
5897        #[rustfmt::skip]
5898        let a = _mm256_setr_epi8(
5899            -1, 1, 2, 3, 4, 5, 6, 7,
5900            8, 9, 10, 11, 12, 13, 14, 15,
5901            16, 17, 18, 19, 20, 21, 22, 23,
5902            24, 25, 26, 27, 28, 29, 30, 31
5903        );
5904        let r1 = _mm256_extract_epi8::<0>(a);
5905        let r2 = _mm256_extract_epi8::<3>(a);
5906        assert_eq!(r1, 0xFF);
5907        assert_eq!(r2, 3);
5908    }
5909
5910    #[simd_test(enable = "avx2")]
5911    const fn test_mm256_extract_epi16() {
5912        #[rustfmt::skip]
5913        let a = _mm256_setr_epi16(
5914            -1, 1, 2, 3, 4, 5, 6, 7,
5915            8, 9, 10, 11, 12, 13, 14, 15,
5916        );
5917        let r1 = _mm256_extract_epi16::<0>(a);
5918        let r2 = _mm256_extract_epi16::<3>(a);
5919        assert_eq!(r1, 0xFFFF);
5920        assert_eq!(r2, 3);
5921    }
5922}