// core/stdarch/crates/core_arch/src/x86/avx2.rs

//! Advanced Vector Extensions 2 (AVX2)
//!
//! AVX2 expands most AVX commands to 256-bit wide vector registers and
//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//!   System Instructions][amd64_ref].
//!
//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
//! overview of the instructions available.
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate

use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::simd::*;

#[cfg(test)]
use stdarch_test::assert_instr;

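// Illustrative note (a sketch, not part of the upstream source): the safe
// intrinsics in this module are gated on `#[target_feature(enable = "avx2")]`.
// A caller either enables the same feature on its own function (as below) or,
// after verifying AVX2 support at runtime, wraps the call in an `unsafe`
// block. The helper name `example_add_i32x8` is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn example_add_i32x8(a: __m256i, b: __m256i) -> __m256i {
    // Same target feature as the intrinsics in this module, so the call is
    // safe here.
    _mm256_add_epi32(a, b)
}
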
/// Computes the absolute values of packed 32-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_abs_epi32(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i32x8();
        let r = simd_select::<m32x8, _>(simd_lt(a, i32x8::ZERO), simd_neg(a), a);
        transmute(r)
    }
}

/// Computes the absolute values of packed 16-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_abs_epi16(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i16x16();
        let r = simd_select::<m16x16, _>(simd_lt(a, i16x16::ZERO), simd_neg(a), a);
        transmute(r)
    }
}

/// Computes the absolute values of packed 8-bit integers in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpabsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_abs_epi8(a: __m256i) -> __m256i {
    unsafe {
        let a = a.as_i8x32();
        let r = simd_select::<m8x32, _>(simd_lt(a, i8x32::ZERO), simd_neg(a), a);
        transmute(r)
    }
}

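// Illustrative example (a sketch, not part of the upstream source): the `abs`
// intrinsics negate every lane that holds a negative value. Note that the
// minimum value of each lane type has no positive counterpart and is returned
// unchanged. The helper name `example_abs_epi32` is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn example_abs_epi32() -> __m256i {
    let v = _mm256_setr_epi32(-1, 2, -3, 4, -5, 6, -7, 8);
    // Result lanes: 1, 2, 3, 4, 5, 6, 7, 8.
    _mm256_abs_epi32(v)
}
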
/// Adds packed 64-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_add(a.as_i64x4(), b.as_i64x4())) }
}

/// Adds packed 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_add(a.as_i32x8(), b.as_i32x8())) }
}

/// Adds packed 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_add(a.as_i16x16(), b.as_i16x16())) }
}

/// Adds packed 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_add(a.as_i8x32(), b.as_i8x32())) }
}

/// Adds packed 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32())) }
}

/// Adds packed 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16())) }
}

/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32())) }
}

/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpaddusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16())) }
}

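// Illustrative example (a sketch, not part of the upstream source): unlike the
// wrapping `_mm256_add_*` intrinsics above, the `adds` variants clamp to the
// representable range of the lane type. The helper name `example_adds_epi8` is
// hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn example_adds_epi8() -> __m256i {
    let a = _mm256_set1_epi8(120);
    let b = _mm256_set1_epi8(20);
    // 120 + 20 saturates to i8::MAX (127) in every lane instead of wrapping.
    _mm256_adds_epi8(a, b)
}
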
/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
/// result, shifts the result right by `IMM8` bytes, and returns the low 16 bytes.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 7))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);

    // If palignr is shifting the pair of vectors more than the size of two
    // lanes, emit zero.
    if IMM8 >= 32 {
        return _mm256_setzero_si256();
    }
    // If palignr is shifting the pair of input vectors more than one lane,
    // but less than two lanes, convert to shifting in zeroes.
    let (a, b) = if IMM8 > 16 {
        (_mm256_setzero_si256(), a)
    } else {
        (a, b)
    };
    unsafe {
        if IMM8 == 16 {
            return transmute(a);
        }
    }
    const fn mask(shift: u32, i: u32) -> u32 {
        let shift = shift % 16;
        let mod_i = i % 16;
        if mod_i < (16 - shift) {
            i + shift
        } else {
            i + 16 + shift
        }
    }

    unsafe {
        let r: i8x32 = simd_shuffle!(
            b.as_i8x32(),
            a.as_i8x32(),
            [
                mask(IMM8 as u32, 0),
                mask(IMM8 as u32, 1),
                mask(IMM8 as u32, 2),
                mask(IMM8 as u32, 3),
                mask(IMM8 as u32, 4),
                mask(IMM8 as u32, 5),
                mask(IMM8 as u32, 6),
                mask(IMM8 as u32, 7),
                mask(IMM8 as u32, 8),
                mask(IMM8 as u32, 9),
                mask(IMM8 as u32, 10),
                mask(IMM8 as u32, 11),
                mask(IMM8 as u32, 12),
                mask(IMM8 as u32, 13),
                mask(IMM8 as u32, 14),
                mask(IMM8 as u32, 15),
                mask(IMM8 as u32, 16),
                mask(IMM8 as u32, 17),
                mask(IMM8 as u32, 18),
                mask(IMM8 as u32, 19),
                mask(IMM8 as u32, 20),
                mask(IMM8 as u32, 21),
                mask(IMM8 as u32, 22),
                mask(IMM8 as u32, 23),
                mask(IMM8 as u32, 24),
                mask(IMM8 as u32, 25),
                mask(IMM8 as u32, 26),
                mask(IMM8 as u32, 27),
                mask(IMM8 as u32, 28),
                mask(IMM8 as u32, 29),
                mask(IMM8 as u32, 30),
                mask(IMM8 as u32, 31),
            ],
        );
        transmute(r)
    }
}

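// Illustrative example (a sketch, not part of the upstream source): with
// `IMM8 = 4`, each 128-bit lane of the result holds bytes 4..16 of `b`'s lane
// followed by bytes 0..4 of `a`'s lane, i.e. the per-lane concatenation `a:b`
// shifted right by four bytes. The helper name `example_alignr` is
// hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn example_alignr(a: __m256i, b: __m256i) -> __m256i {
    _mm256_alignr_epi8::<4>(a, b)
}
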
/// Computes the bitwise AND of 256 bits (representing integer data)
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(simd_and(a.as_i64x4(), b.as_i64x4())) }
}

/// Computes the bitwise NOT of 256 bits (representing integer data)
/// in `a` and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let all_ones = _mm256_set1_epi8(-1);
        transmute(simd_and(
            simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
            b.as_i64x4(),
        ))
    }
}

/// Averages packed unsigned 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let a = simd_cast::<_, u32x16>(a.as_u16x16());
        let b = simd_cast::<_, u32x16>(b.as_u16x16());
        let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1));
        transmute(simd_cast::<_, u16x16>(r))
    }
}

/// Averages packed unsigned 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
    unsafe {
        let a = simd_cast::<_, u16x32>(a.as_u8x32());
        let b = simd_cast::<_, u16x32>(b.as_u8x32());
        let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1));
        transmute(simd_cast::<_, u8x32>(r))
    }
}

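// Illustrative example (a sketch, not part of the upstream source): as the
// implementations above show, the average is computed as `(a + b + 1) >> 1` in
// a wider type, i.e. it rounds half up rather than truncating. The helper name
// `example_avg_epu8` is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn example_avg_epu8() -> __m256i {
    let a = _mm256_set1_epi8(1);
    let b = _mm256_set1_epi8(2);
    // (1 + 2 + 1) >> 1 == 2 in every lane (a truncating average would give 1).
    _mm256_avg_epu8(a, b)
}
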
/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        let a = a.as_i32x4();
        let b = b.as_i32x4();
        let r: i32x4 = simd_shuffle!(
            a,
            b,
            [
                [0, 4, 0, 4][IMM4 as usize & 0b11],
                [1, 1, 5, 5][IMM4 as usize & 0b11],
                [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11],
                [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11],
            ],
        );
        transmute(r)
    }
}

/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i32x8();
        let b = b.as_i32x8();
        let r: i32x8 = simd_shuffle!(
            a,
            b,
            [
                [0, 8, 0, 8][IMM8 as usize & 0b11],
                [1, 1, 9, 9][IMM8 as usize & 0b11],
                [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11],
                [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11],
                [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11],
                [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11],
                [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11],
                [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11],
            ],
        );
        transmute(r)
    }
}

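// Illustrative example (a sketch, not part of the upstream source): bit `i` of
// the control mask selects lane `i` of `b` when set and lane `i` of `a` when
// clear. The helper name `example_blend_epi32` is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn example_blend_epi32() -> __m256i {
    let a = _mm256_set1_epi32(0);
    let b = _mm256_set1_epi32(1);
    // 0b0000_1111: the low four lanes come from `b`, the high four from `a`,
    // giving (1, 1, 1, 1, 0, 0, 0, 0).
    _mm256_blend_epi32::<0b0000_1111>(a, b)
}
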
/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpblendw, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        let a = a.as_i16x16();
        let b = b.as_i16x16();

        let r: i16x16 = simd_shuffle!(
            a,
            b,
            [
                [0, 16, 0, 16][IMM8 as usize & 0b11],
                [1, 1, 17, 17][IMM8 as usize & 0b11],
                [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11],
                [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11],
                [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11],
                [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11],
                [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11],
                [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11],
                [8, 24, 8, 24][IMM8 as usize & 0b11],
                [9, 9, 25, 25][IMM8 as usize & 0b11],
                [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11],
                [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11],
                [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11],
                [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11],
                [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11],
                [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11],
            ],
        );
        transmute(r)
    }
}

/// Blends packed 8-bit integers from `a` and `b` using `mask`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
    unsafe {
        let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO);
        transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32()))
    }
}

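// Illustrative example (a sketch, not part of the upstream source): only the
// most significant bit of each 8-bit `mask` lane matters, which makes the
// all-ones/all-zeros output of the comparison intrinsics below a convenient
// mask source. The helper name `example_blendv` is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn example_blendv(a: __m256i, b: __m256i) -> __m256i {
    // Take the lane from `b` wherever `b > a` (per signed 8-bit lane),
    // i.e. a per-byte signed maximum of `a` and `b`.
    let mask = _mm256_cmpgt_epi8(b, a);
    _mm256_blendv_epi8(a, b, mask)
}
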
/// Broadcasts the low packed 8-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
    unsafe {
        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 16]);
        transmute::<i8x16, _>(ret)
    }
}

/// Broadcasts the low packed 8-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 32]);
        transmute::<i8x32, _>(ret)
    }
}

// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
// often compiled to `vbroadcastss`.
/// Broadcasts the low packed 32-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
    unsafe {
        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 4]);
        transmute::<i32x4, _>(ret)
    }
}

// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
// often compiled to `vbroadcastss`.
/// Broadcasts the low packed 32-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 8]);
        transmute::<i32x8, _>(ret)
    }
}

/// Broadcasts the low packed 64-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
// Emits `vmovddup` instead of `vpbroadcastq`
// See https://github.com/rust-lang/stdarch/issues/791
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
    unsafe {
        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
        transmute::<i64x2, _>(ret)
    }
}

/// Broadcasts the low packed 64-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
        transmute::<i64x4, _>(ret)
    }
}

/// Broadcasts the low double-precision (64-bit) floating-point element
/// from `a` to all elements of the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vmovddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 2]) }
}

/// Broadcasts the low double-precision (64-bit) floating-point element
/// from `a` to all elements of the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 4]) }
}

/// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
pub fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
        transmute::<i64x4, _>(ret)
    }
}

// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
// `vbroadcastf128`.
/// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
        transmute::<i64x4, _>(ret)
    }
}

/// Broadcasts the low single-precision (32-bit) floating-point element
/// from `a` to all elements of the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastss_ps(a: __m128) -> __m128 {
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 4]) }
}

/// Broadcasts the low single-precision (32-bit) floating-point element
/// from `a` to all elements of the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 8]) }
}

/// Broadcasts the low packed 16-bit integer from `a` to all elements of
/// the 128-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
    unsafe {
        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 8]);
        transmute::<i16x8, _>(ret)
    }
}

/// Broadcasts the low packed 16-bit integer from `a` to all elements of
/// the 256-bit returned value.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpbroadcastw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
    unsafe {
        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 16]);
        transmute::<i16x16, _>(ret)
    }
}

/// Compares packed 64-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4())) }
}

/// Compares packed 32-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8())) }
}

/// Compares packed 16-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16())) }
}

/// Compares packed 8-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpeqb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32())) }
}

/// Compares packed 64-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4())) }
}

/// Compares packed 32-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8())) }
}

/// Compares packed 16-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16())) }
}

/// Compares packed 8-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32())) }
}

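// Illustrative example (a sketch, not part of the upstream source): the
// comparison intrinsics return all-ones (-1) in matching lanes and zero
// elsewhere, so the result can feed blends or `_mm256_movemask_epi8` (defined
// elsewhere in this module). The helper name `example_count_eq_bytes` is
// hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn example_count_eq_bytes(a: __m256i, b: __m256i) -> u32 {
    let eq = _mm256_cmpeq_epi8(a, b);
    // One mask bit per byte lane; count how many lanes compared equal.
    _mm256_movemask_epi8(eq).count_ones()
}
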
/// Sign-extends 16-bit integers to 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
    unsafe { transmute::<i32x8, _>(simd_cast(a.as_i16x8())) }
}

/// Sign-extends 16-bit integers to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i16x8();
        let v64: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v64))
    }
}

/// Sign-extends 32-bit integers to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
    unsafe { transmute::<i64x4, _>(simd_cast(a.as_i32x4())) }
}

/// Sign-extends 8-bit integers to 16-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
    unsafe { transmute::<i16x16, _>(simd_cast(a.as_i8x16())) }
}

/// Sign-extends 8-bit integers to 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i8x16();
        let v64: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute::<i32x8, _>(simd_cast(v64))
    }
}

/// Sign-extends 8-bit integers to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_i8x16();
        let v32: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v32))
    }
}

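// Illustrative example (a sketch, not part of the upstream source): the
// narrower-to-wider conversions only consume as many low elements of `a` as
// fit in the 256-bit result; `_mm256_cvtepi8_epi64` reads just the low four
// bytes. The helper name `example_cvtepi8_epi64` is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn example_cvtepi8_epi64() -> __m256i {
    let a = _mm_setr_epi8(-1, 2, -3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    // Result lanes (as i64): -1, 2, -3, 4; the upper twelve bytes are ignored.
    _mm256_cvtepi8_epi64(a)
}
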
/// Zero-extends packed unsigned 16-bit integers in `a` to packed 32-bit
/// integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
    unsafe { transmute::<i32x8, _>(simd_cast(a.as_u16x8())) }
}

/// Zero-extends the lower four unsigned 16-bit integers in `a` to 64-bit
/// integers. The upper four elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_u16x8();
        let v64: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v64))
    }
}

/// Zero-extends unsigned 32-bit integers in `a` to 64-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
    unsafe { transmute::<i64x4, _>(simd_cast(a.as_u32x4())) }
}

/// Zero-extends unsigned 8-bit integers in `a` to 16-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
    unsafe { transmute::<i16x16, _>(simd_cast(a.as_u8x16())) }
}

/// Zero-extends the lower eight unsigned 8-bit integers in `a` to 32-bit
/// integers. The upper eight elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_u8x16();
        let v64: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute::<i32x8, _>(simd_cast(v64))
    }
}

/// Zero-extends the lower four unsigned 8-bit integers in `a` to 64-bit
/// integers. The upper twelve elements of `a` are unused.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
    unsafe {
        let a = a.as_u8x16();
        let v32: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute::<i64x4, _>(simd_cast(v32))
    }
}

/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1);
    unsafe {
        let a = a.as_i64x4();
        let b = i64x4::ZERO;
        let dst: i64x2 = simd_shuffle!(a, b, [[0, 1], [2, 3]][IMM1 as usize]);
        transmute(dst)
    }
}

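// Illustrative example (a sketch, not part of the upstream source): `IMM1`
// selects which 128-bit half is returned, 0 for the low half and 1 for the
// high half. The helper name `example_high_half` is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn example_high_half(a: __m256i) -> __m128i {
    // Returns bits 255..128 of `a`.
    _mm256_extracti128_si256::<1>(a)
}
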
/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i16x16();
    let b = b.as_i16x16();
    unsafe {
        let even: i16x16 = simd_shuffle!(
            a,
            b,
            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
        );
        let odd: i16x16 = simd_shuffle!(
            a,
            b,
            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
        );
        simd_add(even, odd).as_m256i()
    }
}

/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i32x8();
    let b = b.as_i32x8();
    unsafe {
        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_add(even, odd).as_m256i()
    }
}

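// Illustrative example (a sketch, not part of the upstream source): the
// horizontal adds operate within 128-bit lanes, so the result interleaves pair
// sums from `a` and `b` lane by lane rather than listing all of `a`'s sums
// first. The helper name `example_hadd_epi32` is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
fn example_hadd_epi32() -> __m256i {
    let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
    let b = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
    // Result lanes: 3, 7, 30, 70, 11, 15, 110, 150.
    _mm256_hadd_epi32(a, b)
}
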
/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphaddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) }
}

/// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i16x16();
    let b = b.as_i16x16();
    unsafe {
        let even: i16x16 = simd_shuffle!(
            a,
            b,
            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
        );
        let odd: i16x16 = simd_shuffle!(
            a,
            b,
            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
        );
        simd_sub(even, odd).as_m256i()
    }
}

/// Horizontally subtracts adjacent pairs of 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
    let a = a.as_i32x8();
    let b = b.as_i32x8();
    unsafe {
        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
        simd_sub(even, odd).as_m256i()
    }
}

/// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`
/// using saturation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vphsubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
    unsafe { transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) }
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    let zero = i32x4::ZERO;
    let neg_one = _mm_set1_epi32(-1).as_i32x4();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    let r = pgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
/// of the corresponding `mask` element is not set, the value is loaded from
/// `src` instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_epi32<const SCALE: i32>(
    src: __m128i,
    slice: *const i32,
    offsets: __m128i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i32x4();
    let mask = mask.as_i32x4();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    let r = pgatherdd(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_epi32<const SCALE: i32>(
    slice: *const i32,
    offsets: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    let zero = i32x8::ZERO;
    let neg_one = _mm256_set1_epi32(-1).as_i32x8();
    let offsets = offsets.as_i32x8();
    let slice = slice as *const i8;
    let r = vpgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
/// of the corresponding `mask` element is not set, the value is loaded from
/// `src` instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi32)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_epi32<const SCALE: i32>(
    src: __m256i,
    slice: *const i32,
    offsets: __m256i,
    mask: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i32x8();
    let mask = mask.as_i32x8();
    let offsets = offsets.as_i32x8();
    let slice = slice as *const i8;
    let r = vpgatherdd(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}

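// Illustrative example (a sketch, not part of the upstream source): `SCALE` is
// a byte multiplier, so gathering `i32` elements by element index uses
// `SCALE = 4`. The caller must guarantee that every `base + offset * SCALE`
// address is readable, which is why the gathers are `unsafe`. The helper name
// `example_gather_i32` is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn example_gather_i32(table: &[i32; 16]) -> __m256i {
    let offsets = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
    // Loads table[0], table[2], ..., table[14] into the eight lanes.
    _mm256_i32gather_epi32::<4>(table.as_ptr(), offsets)
}
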
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm_setzero_ps();
    let neg_one = _mm_set1_ps(-1.0);
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    pgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
/// of the corresponding `mask` element is not set, the value is loaded from
/// `src` instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_ps<const SCALE: i32>(
    src: __m128,
    slice: *const f32,
    offsets: __m128i,
    mask: __m128,
) -> __m128 {
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    pgatherdps(src, slice, offsets, mask, SCALE as i8)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m256 {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm256_setzero_ps();
    let neg_one = _mm256_set1_ps(-1.0);
    let offsets = offsets.as_i32x8();
    let slice = slice as *const i8;
    vpgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
/// of the corresponding `mask` element is not set, the value is loaded from
/// `src` instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_ps)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_ps<const SCALE: i32>(
    src: __m256,
    slice: *const f32,
    offsets: __m256i,
    mask: __m256,
) -> __m256 {
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i32x8();
    let slice = slice as *const i8;
    vpgatherdps(src, slice, offsets, mask, SCALE as i8)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_epi64<const SCALE: i32>(
    slice: *const i64,
    offsets: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    let zero = i64x2::ZERO;
    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    let r = pgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
/// of the corresponding `mask` element is not set, the value is loaded from
/// `src` instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_epi64<const SCALE: i32>(
    src: __m128i,
    slice: *const i64,
    offsets: __m128i,
    mask: __m128i,
) -> __m128i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i64x2();
    let mask = mask.as_i64x2();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    let r = pgatherdq(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_epi64<const SCALE: i32>(
    slice: *const i64,
    offsets: __m128i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    let zero = i64x4::ZERO;
    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    let r = vpgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
    transmute(r)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
/// of the corresponding `mask` element is not set, the value is loaded from
/// `src` instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi64)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_epi64<const SCALE: i32>(
    src: __m256i,
    slice: *const i64,
    offsets: __m128i,
    mask: __m256i,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    let src = src.as_i64x4();
    let mask = mask.as_i64x4();
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    let r = vpgatherdq(src, slice, offsets, mask, SCALE as i8);
    transmute(r)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_i32gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm_setzero_pd();
    let neg_one = _mm_set1_pd(-1.0);
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    pgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
/// of the corresponding `mask` element is not set, the value is loaded from
/// `src` instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mask_i32gather_pd<const SCALE: i32>(
    src: __m128d,
    slice: *const f64,
    offsets: __m128i,
    mask: __m128d,
) -> __m128d {
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    pgatherdpd(src, slice, offsets, mask, SCALE as i8)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_i32gather_pd<const SCALE: i32>(
    slice: *const f64,
    offsets: __m128i,
) -> __m256d {
    static_assert_imm8_scale!(SCALE);
    let zero = _mm256_setzero_pd();
    let neg_one = _mm256_set1_pd(-1.0);
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    vpgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
}

/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
/// of the corresponding `mask` element is not set, the value is loaded from
/// `src` instead.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_pd)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mask_i32gather_pd<const SCALE: i32>(
    src: __m256d,
    slice: *const f64,
    offsets: __m128i,
    mask: __m256d,
) -> __m256d {
    static_assert_imm8_scale!(SCALE);
    let offsets = offsets.as_i32x4();
    let slice = slice as *const i8;
    vpgatherdpd(src, slice, offsets, mask, SCALE as i8)
}

1361/// Returns values from `slice` at offsets determined by `offsets * scale`,
1362/// where
1363/// `scale` should be 1, 2, 4 or 8.
1364///
1365/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi32)
1366#[inline]
1367#[target_feature(enable = "avx2")]
1368#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1369#[rustc_legacy_const_generics(2)]
1370#[stable(feature = "simd_x86", since = "1.27.0")]
1371pub unsafe fn _mm_i64gather_epi32<const SCALE: i32>(
1372    slice: *const i32,
1373    offsets: __m128i,
1374) -> __m128i {
1375    static_assert_imm8_scale!(SCALE);
1376    let zero = i32x4::ZERO;
1377    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1378    let offsets = offsets.as_i64x2();
1379    let slice = slice as *const i8;
1380    let r = pgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
1381    transmute(r)
1382}
1383
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in
/// the corresponding element of `mask`, the value from `src` at that
/// position is returned instead.
1388///
1389/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi32)
1390#[inline]
1391#[target_feature(enable = "avx2")]
1392#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1393#[rustc_legacy_const_generics(4)]
1394#[stable(feature = "simd_x86", since = "1.27.0")]
1395pub unsafe fn _mm_mask_i64gather_epi32<const SCALE: i32>(
1396    src: __m128i,
1397    slice: *const i32,
1398    offsets: __m128i,
1399    mask: __m128i,
1400) -> __m128i {
1401    static_assert_imm8_scale!(SCALE);
1402    let src = src.as_i32x4();
1403    let mask = mask.as_i32x4();
1404    let offsets = offsets.as_i64x2();
1405    let slice = slice as *const i8;
1406    let r = pgatherqd(src, slice, offsets, mask, SCALE as i8);
1407    transmute(r)
1408}
1409
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1413///
1414/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi32)
1415#[inline]
1416#[target_feature(enable = "avx2")]
1417#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1418#[rustc_legacy_const_generics(2)]
1419#[stable(feature = "simd_x86", since = "1.27.0")]
1420pub unsafe fn _mm256_i64gather_epi32<const SCALE: i32>(
1421    slice: *const i32,
1422    offsets: __m256i,
1423) -> __m128i {
1424    static_assert_imm8_scale!(SCALE);
1425    let zero = i32x4::ZERO;
1426    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1427    let offsets = offsets.as_i64x4();
1428    let slice = slice as *const i8;
1429    let r = vpgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
1430    transmute(r)
1431}
1432
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in
/// the corresponding element of `mask`, the value from `src` at that
/// position is returned instead.
1437///
1438/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi32)
1439#[inline]
1440#[target_feature(enable = "avx2")]
1441#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1442#[rustc_legacy_const_generics(4)]
1443#[stable(feature = "simd_x86", since = "1.27.0")]
1444pub unsafe fn _mm256_mask_i64gather_epi32<const SCALE: i32>(
1445    src: __m128i,
1446    slice: *const i32,
1447    offsets: __m256i,
1448    mask: __m128i,
1449) -> __m128i {
1450    static_assert_imm8_scale!(SCALE);
1451    let src = src.as_i32x4();
1452    let mask = mask.as_i32x4();
1453    let offsets = offsets.as_i64x4();
1454    let slice = slice as *const i8;
1455    let r = vpgatherqd(src, slice, offsets, mask, SCALE as i8);
1456    transmute(r)
1457}
1458
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1462///
1463/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_ps)
1464#[inline]
1465#[target_feature(enable = "avx2")]
1466#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1467#[rustc_legacy_const_generics(2)]
1468#[stable(feature = "simd_x86", since = "1.27.0")]
1469pub unsafe fn _mm_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
1470    static_assert_imm8_scale!(SCALE);
1471    let zero = _mm_setzero_ps();
1472    let neg_one = _mm_set1_ps(-1.0);
1473    let offsets = offsets.as_i64x2();
1474    let slice = slice as *const i8;
1475    pgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
1476}
1477
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in
/// the corresponding element of `mask`, the value from `src` at that
/// position is returned instead.
1482///
1483/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_ps)
1484#[inline]
1485#[target_feature(enable = "avx2")]
1486#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1487#[rustc_legacy_const_generics(4)]
1488#[stable(feature = "simd_x86", since = "1.27.0")]
1489pub unsafe fn _mm_mask_i64gather_ps<const SCALE: i32>(
1490    src: __m128,
1491    slice: *const f32,
1492    offsets: __m128i,
1493    mask: __m128,
1494) -> __m128 {
1495    static_assert_imm8_scale!(SCALE);
1496    let offsets = offsets.as_i64x2();
1497    let slice = slice as *const i8;
1498    pgatherqps(src, slice, offsets, mask, SCALE as i8)
1499}
1500
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1504///
1505/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_ps)
1506#[inline]
1507#[target_feature(enable = "avx2")]
1508#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1509#[rustc_legacy_const_generics(2)]
1510#[stable(feature = "simd_x86", since = "1.27.0")]
1511pub unsafe fn _mm256_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m128 {
1512    static_assert_imm8_scale!(SCALE);
1513    let zero = _mm_setzero_ps();
1514    let neg_one = _mm_set1_ps(-1.0);
1515    let offsets = offsets.as_i64x4();
1516    let slice = slice as *const i8;
1517    vpgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
1518}
1519
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in
/// the corresponding element of `mask`, the value from `src` at that
/// position is returned instead.
1524///
1525/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_ps)
1526#[inline]
1527#[target_feature(enable = "avx2")]
1528#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1529#[rustc_legacy_const_generics(4)]
1530#[stable(feature = "simd_x86", since = "1.27.0")]
1531pub unsafe fn _mm256_mask_i64gather_ps<const SCALE: i32>(
1532    src: __m128,
1533    slice: *const f32,
1534    offsets: __m256i,
1535    mask: __m128,
1536) -> __m128 {
1537    static_assert_imm8_scale!(SCALE);
1538    let offsets = offsets.as_i64x4();
1539    let slice = slice as *const i8;
1540    vpgatherqps(src, slice, offsets, mask, SCALE as i8)
1541}
1542
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1546///
1547/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi64)
1548#[inline]
1549#[target_feature(enable = "avx2")]
1550#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1551#[rustc_legacy_const_generics(2)]
1552#[stable(feature = "simd_x86", since = "1.27.0")]
1553pub unsafe fn _mm_i64gather_epi64<const SCALE: i32>(
1554    slice: *const i64,
1555    offsets: __m128i,
1556) -> __m128i {
1557    static_assert_imm8_scale!(SCALE);
1558    let zero = i64x2::ZERO;
1559    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1560    let slice = slice as *const i8;
1561    let offsets = offsets.as_i64x2();
1562    let r = pgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
1563    transmute(r)
1564}
1565
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in
/// the corresponding element of `mask`, the value from `src` at that
/// position is returned instead.
1570///
1571/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi64)
1572#[inline]
1573#[target_feature(enable = "avx2")]
1574#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1575#[rustc_legacy_const_generics(4)]
1576#[stable(feature = "simd_x86", since = "1.27.0")]
1577pub unsafe fn _mm_mask_i64gather_epi64<const SCALE: i32>(
1578    src: __m128i,
1579    slice: *const i64,
1580    offsets: __m128i,
1581    mask: __m128i,
1582) -> __m128i {
1583    static_assert_imm8_scale!(SCALE);
1584    let src = src.as_i64x2();
1585    let mask = mask.as_i64x2();
1586    let offsets = offsets.as_i64x2();
1587    let slice = slice as *const i8;
1588    let r = pgatherqq(src, slice, offsets, mask, SCALE as i8);
1589    transmute(r)
1590}
1591
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1595///
1596/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi64)
1597#[inline]
1598#[target_feature(enable = "avx2")]
1599#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1600#[rustc_legacy_const_generics(2)]
1601#[stable(feature = "simd_x86", since = "1.27.0")]
1602pub unsafe fn _mm256_i64gather_epi64<const SCALE: i32>(
1603    slice: *const i64,
1604    offsets: __m256i,
1605) -> __m256i {
1606    static_assert_imm8_scale!(SCALE);
1607    let zero = i64x4::ZERO;
1608    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1609    let slice = slice as *const i8;
1610    let offsets = offsets.as_i64x4();
1611    let r = vpgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
1612    transmute(r)
1613}
1614
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in
/// the corresponding element of `mask`, the value from `src` at that
/// position is returned instead.
1619///
1620/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi64)
1621#[inline]
1622#[target_feature(enable = "avx2")]
1623#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1624#[rustc_legacy_const_generics(4)]
1625#[stable(feature = "simd_x86", since = "1.27.0")]
1626pub unsafe fn _mm256_mask_i64gather_epi64<const SCALE: i32>(
1627    src: __m256i,
1628    slice: *const i64,
1629    offsets: __m256i,
1630    mask: __m256i,
1631) -> __m256i {
1632    static_assert_imm8_scale!(SCALE);
1633    let src = src.as_i64x4();
1634    let mask = mask.as_i64x4();
1635    let offsets = offsets.as_i64x4();
1636    let slice = slice as *const i8;
1637    let r = vpgatherqq(src, slice, offsets, mask, SCALE as i8);
1638    transmute(r)
1639}
1640
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1644///
1645/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd)
1646#[inline]
1647#[target_feature(enable = "avx2")]
1648#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1649#[rustc_legacy_const_generics(2)]
1650#[stable(feature = "simd_x86", since = "1.27.0")]
1651pub unsafe fn _mm_i64gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
1652    static_assert_imm8_scale!(SCALE);
1653    let zero = _mm_setzero_pd();
1654    let neg_one = _mm_set1_pd(-1.0);
1655    let slice = slice as *const i8;
1656    let offsets = offsets.as_i64x2();
1657    pgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
1658}
1659
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in
/// the corresponding element of `mask`, the value from `src` at that
/// position is returned instead.
1664///
1665/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_pd)
1666#[inline]
1667#[target_feature(enable = "avx2")]
1668#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1669#[rustc_legacy_const_generics(4)]
1670#[stable(feature = "simd_x86", since = "1.27.0")]
1671pub unsafe fn _mm_mask_i64gather_pd<const SCALE: i32>(
1672    src: __m128d,
1673    slice: *const f64,
1674    offsets: __m128i,
1675    mask: __m128d,
1676) -> __m128d {
1677    static_assert_imm8_scale!(SCALE);
1678    let slice = slice as *const i8;
1679    let offsets = offsets.as_i64x2();
1680    pgatherqpd(src, slice, offsets, mask, SCALE as i8)
1681}
1682
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1686///
1687/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_pd)
1688#[inline]
1689#[target_feature(enable = "avx2")]
1690#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1691#[rustc_legacy_const_generics(2)]
1692#[stable(feature = "simd_x86", since = "1.27.0")]
1693pub unsafe fn _mm256_i64gather_pd<const SCALE: i32>(
1694    slice: *const f64,
1695    offsets: __m256i,
1696) -> __m256d {
1697    static_assert_imm8_scale!(SCALE);
1698    let zero = _mm256_setzero_pd();
1699    let neg_one = _mm256_set1_pd(-1.0);
1700    let slice = slice as *const i8;
1701    let offsets = offsets.as_i64x4();
1702    vpgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
1703}
1704
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit is not set in
/// the corresponding element of `mask`, the value from `src` at that
/// position is returned instead.
1709///
1710/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_pd)
1711#[inline]
1712#[target_feature(enable = "avx2")]
1713#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1714#[rustc_legacy_const_generics(4)]
1715#[stable(feature = "simd_x86", since = "1.27.0")]
1716pub unsafe fn _mm256_mask_i64gather_pd<const SCALE: i32>(
1717    src: __m256d,
1718    slice: *const f64,
1719    offsets: __m256i,
1720    mask: __m256d,
1721) -> __m256d {
1722    static_assert_imm8_scale!(SCALE);
1723    let slice = slice as *const i8;
1724    let offsets = offsets.as_i64x4();
1725    vpgatherqpd(src, slice, offsets, mask, SCALE as i8)
1726}
1727
/// Copies `a` to `dst`, then inserts 128 bits (of integer data) from `b` at
/// the location specified by `IMM1`.
1730///
1731/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256)
1732#[inline]
1733#[target_feature(enable = "avx2")]
1734#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1735#[rustc_legacy_const_generics(2)]
1736#[stable(feature = "simd_x86", since = "1.27.0")]
1737pub fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
1738    static_assert_uimm_bits!(IMM1, 1);
1739    unsafe {
1740        let a = a.as_i64x4();
1741        let b = _mm256_castsi128_si256(b).as_i64x4();
1742        let dst: i64x4 = simd_shuffle!(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
1743        transmute(dst)
1744    }
1745}
1746
/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Horizontally adds adjacent pairs
/// of intermediate 32-bit integers.
1750///
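/// Picturing the vectors as arrays, the operation is roughly equivalent to
/// the following scalar sketch (illustration only, not the actual
/// implementation):
///
/// ```
/// fn madd_epi16(a: [i16; 16], b: [i16; 16]) -> [i32; 8] {
///     let mut r = [0i32; 8];
///     for i in 0..8 {
///         // Each 32-bit result is the sum of two adjacent products; the
///         // addition wraps on overflow.
///         r[i] = (a[2 * i] as i32 * b[2 * i] as i32)
///             .wrapping_add(a[2 * i + 1] as i32 * b[2 * i + 1] as i32);
///     }
///     r
/// }
/// ```
///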
1751/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16)
1752#[inline]
1753#[target_feature(enable = "avx2")]
1754#[cfg_attr(test, assert_instr(vpmaddwd))]
1755#[stable(feature = "simd_x86", since = "1.27.0")]
1756pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
1757    unsafe {
1758        let r: i32x16 = simd_mul(simd_cast(a.as_i16x16()), simd_cast(b.as_i16x16()));
1759        let even: i32x8 = simd_shuffle!(r, r, [0, 2, 4, 6, 8, 10, 12, 14]);
1760        let odd: i32x8 = simd_shuffle!(r, r, [1, 3, 5, 7, 9, 11, 13, 15]);
1761        simd_add(even, odd).as_m256i()
1762    }
1763}
1764
/// Vertically multiplies each unsigned 8-bit integer from `a` with the
/// corresponding signed 8-bit integer from `b`, producing intermediate
/// signed 16-bit integers. Horizontally adds adjacent pairs of intermediate
/// signed 16-bit integers.
1769///
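/// A scalar sketch of the operation (illustration only, not the actual
/// implementation); note that the horizontal addition saturates:
///
/// ```
/// fn maddubs_epi16(a: [u8; 32], b: [i8; 32]) -> [i16; 16] {
///     let mut r = [0i16; 16];
///     for i in 0..16 {
///         let sum = a[2 * i] as i32 * b[2 * i] as i32
///             + a[2 * i + 1] as i32 * b[2 * i + 1] as i32;
///         // The sum of the two products is saturated to the i16 range.
///         r[i] = sum.clamp(i16::MIN as i32, i16::MAX as i32) as i16;
///     }
///     r
/// }
/// ```
///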
1770/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16)
1771#[inline]
1772#[target_feature(enable = "avx2")]
1773#[cfg_attr(test, assert_instr(vpmaddubsw))]
1774#[stable(feature = "simd_x86", since = "1.27.0")]
1775pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1776    unsafe { transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32())) }
1777}
1778
/// Loads packed 32-bit integers from memory pointed to by `mem_addr` using
/// `mask` (elements are zeroed out when the highest bit is not set in the
/// corresponding element).
1782///
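/// A scalar sketch of the per-element behavior (illustration only, not the
/// actual implementation):
///
/// ```
/// fn maskload_epi32(mem: &[i32; 4], mask: [i32; 4]) -> [i32; 4] {
///     let mut r = [0i32; 4];
///     for i in 0..4 {
///         // An element is loaded only when the highest (sign) bit of the
///         // corresponding mask element is set; otherwise it stays zero.
///         if mask[i] < 0 {
///             r[i] = mem[i];
///         }
///     }
///     r
/// }
/// ```
///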
1783/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi32)
1784#[inline]
1785#[target_feature(enable = "avx2")]
1786#[cfg_attr(test, assert_instr(vpmaskmovd))]
1787#[stable(feature = "simd_x86", since = "1.27.0")]
1788pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
1789    transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4()))
1790}
1791
/// Loads packed 32-bit integers from memory pointed to by `mem_addr` using
/// `mask` (elements are zeroed out when the highest bit is not set in the
/// corresponding element).
1795///
1796/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi32)
1797#[inline]
1798#[target_feature(enable = "avx2")]
1799#[cfg_attr(test, assert_instr(vpmaskmovd))]
1800#[stable(feature = "simd_x86", since = "1.27.0")]
1801pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
1802    transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8()))
1803}
1804
/// Loads packed 64-bit integers from memory pointed to by `mem_addr` using
/// `mask` (elements are zeroed out when the highest bit is not set in the
/// corresponding element).
1808///
1809/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi64)
1810#[inline]
1811#[target_feature(enable = "avx2")]
1812#[cfg_attr(test, assert_instr(vpmaskmovq))]
1813#[stable(feature = "simd_x86", since = "1.27.0")]
1814pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
1815    transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2()))
1816}
1817
/// Loads packed 64-bit integers from memory pointed to by `mem_addr` using
/// `mask` (elements are zeroed out when the highest bit is not set in the
/// corresponding element).
1821///
1822/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi64)
1823#[inline]
1824#[target_feature(enable = "avx2")]
1825#[cfg_attr(test, assert_instr(vpmaskmovq))]
1826#[stable(feature = "simd_x86", since = "1.27.0")]
1827pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
1828    transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4()))
1829}
1830
/// Stores packed 32-bit integers from `a` into memory pointed to by
/// `mem_addr` using `mask` (elements are not stored when the highest bit is
/// not set in the corresponding element).
1834///
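/// The store side behaves roughly like this scalar sketch (illustration
/// only, not the actual implementation):
///
/// ```
/// fn maskstore_epi32(mem: &mut [i32; 4], mask: [i32; 4], a: [i32; 4]) {
///     for i in 0..4 {
///         // An element is written only when the highest (sign) bit of the
///         // corresponding mask element is set.
///         if mask[i] < 0 {
///             mem[i] = a[i];
///         }
///     }
/// }
/// ```
///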
1835/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi32)
1836#[inline]
1837#[target_feature(enable = "avx2")]
1838#[cfg_attr(test, assert_instr(vpmaskmovd))]
1839#[stable(feature = "simd_x86", since = "1.27.0")]
1840pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
1841    maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4())
1842}
1843
/// Stores packed 32-bit integers from `a` into memory pointed to by
/// `mem_addr` using `mask` (elements are not stored when the highest bit is
/// not set in the corresponding element).
1847///
1848/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi32)
1849#[inline]
1850#[target_feature(enable = "avx2")]
1851#[cfg_attr(test, assert_instr(vpmaskmovd))]
1852#[stable(feature = "simd_x86", since = "1.27.0")]
1853pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
1854    maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8())
1855}
1856
/// Stores packed 64-bit integers from `a` into memory pointed to by
/// `mem_addr` using `mask` (elements are not stored when the highest bit is
/// not set in the corresponding element).
1860///
1861/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi64)
1862#[inline]
1863#[target_feature(enable = "avx2")]
1864#[cfg_attr(test, assert_instr(vpmaskmovq))]
1865#[stable(feature = "simd_x86", since = "1.27.0")]
1866pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
1867    maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2())
1868}
1869
/// Stores packed 64-bit integers from `a` into memory pointed to by
/// `mem_addr` using `mask` (elements are not stored when the highest bit is
/// not set in the corresponding element).
1873///
1874/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi64)
1875#[inline]
1876#[target_feature(enable = "avx2")]
1877#[cfg_attr(test, assert_instr(vpmaskmovq))]
1878#[stable(feature = "simd_x86", since = "1.27.0")]
1879pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
1880    maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4())
1881}
1882
1883/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
1884/// maximum values.
1885///
1886/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16)
1887#[inline]
1888#[target_feature(enable = "avx2")]
1889#[cfg_attr(test, assert_instr(vpmaxsw))]
1890#[stable(feature = "simd_x86", since = "1.27.0")]
1891pub fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
1892    unsafe {
1893        let a = a.as_i16x16();
1894        let b = b.as_i16x16();
1895        transmute(simd_select::<i16x16, _>(simd_gt(a, b), a, b))
1896    }
1897}
1898
1899/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
1900/// maximum values.
1901///
1902/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32)
1903#[inline]
1904#[target_feature(enable = "avx2")]
1905#[cfg_attr(test, assert_instr(vpmaxsd))]
1906#[stable(feature = "simd_x86", since = "1.27.0")]
1907pub fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
1908    unsafe {
1909        let a = a.as_i32x8();
1910        let b = b.as_i32x8();
1911        transmute(simd_select::<i32x8, _>(simd_gt(a, b), a, b))
1912    }
1913}
1914
1915/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
1916/// maximum values.
1917///
1918/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8)
1919#[inline]
1920#[target_feature(enable = "avx2")]
1921#[cfg_attr(test, assert_instr(vpmaxsb))]
1922#[stable(feature = "simd_x86", since = "1.27.0")]
1923pub fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
1924    unsafe {
1925        let a = a.as_i8x32();
1926        let b = b.as_i8x32();
1927        transmute(simd_select::<i8x32, _>(simd_gt(a, b), a, b))
1928    }
1929}
1930
1931/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
1932/// the packed maximum values.
1933///
1934/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16)
1935#[inline]
1936#[target_feature(enable = "avx2")]
1937#[cfg_attr(test, assert_instr(vpmaxuw))]
1938#[stable(feature = "simd_x86", since = "1.27.0")]
1939pub fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
1940    unsafe {
1941        let a = a.as_u16x16();
1942        let b = b.as_u16x16();
1943        transmute(simd_select::<i16x16, _>(simd_gt(a, b), a, b))
1944    }
1945}
1946
1947/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
1948/// the packed maximum values.
1949///
1950/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32)
1951#[inline]
1952#[target_feature(enable = "avx2")]
1953#[cfg_attr(test, assert_instr(vpmaxud))]
1954#[stable(feature = "simd_x86", since = "1.27.0")]
1955pub fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
1956    unsafe {
1957        let a = a.as_u32x8();
1958        let b = b.as_u32x8();
1959        transmute(simd_select::<i32x8, _>(simd_gt(a, b), a, b))
1960    }
1961}
1962
1963/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
1964/// the packed maximum values.
1965///
1966/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8)
1967#[inline]
1968#[target_feature(enable = "avx2")]
1969#[cfg_attr(test, assert_instr(vpmaxub))]
1970#[stable(feature = "simd_x86", since = "1.27.0")]
1971pub fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
1972    unsafe {
1973        let a = a.as_u8x32();
1974        let b = b.as_u8x32();
1975        transmute(simd_select::<i8x32, _>(simd_gt(a, b), a, b))
1976    }
1977}
1978
1979/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
1980/// minimum values.
1981///
1982/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16)
1983#[inline]
1984#[target_feature(enable = "avx2")]
1985#[cfg_attr(test, assert_instr(vpminsw))]
1986#[stable(feature = "simd_x86", since = "1.27.0")]
1987pub fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
1988    unsafe {
1989        let a = a.as_i16x16();
1990        let b = b.as_i16x16();
1991        transmute(simd_select::<i16x16, _>(simd_lt(a, b), a, b))
1992    }
1993}
1994
1995/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
1996/// minimum values.
1997///
1998/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32)
1999#[inline]
2000#[target_feature(enable = "avx2")]
2001#[cfg_attr(test, assert_instr(vpminsd))]
2002#[stable(feature = "simd_x86", since = "1.27.0")]
2003pub fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
2004    unsafe {
2005        let a = a.as_i32x8();
2006        let b = b.as_i32x8();
2007        transmute(simd_select::<i32x8, _>(simd_lt(a, b), a, b))
2008    }
2009}
2010
2011/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2012/// minimum values.
2013///
2014/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8)
2015#[inline]
2016#[target_feature(enable = "avx2")]
2017#[cfg_attr(test, assert_instr(vpminsb))]
2018#[stable(feature = "simd_x86", since = "1.27.0")]
2019pub fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
2020    unsafe {
2021        let a = a.as_i8x32();
2022        let b = b.as_i8x32();
2023        transmute(simd_select::<i8x32, _>(simd_lt(a, b), a, b))
2024    }
2025}
2026
2027/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2028/// the packed minimum values.
2029///
2030/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16)
2031#[inline]
2032#[target_feature(enable = "avx2")]
2033#[cfg_attr(test, assert_instr(vpminuw))]
2034#[stable(feature = "simd_x86", since = "1.27.0")]
2035pub fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
2036    unsafe {
2037        let a = a.as_u16x16();
2038        let b = b.as_u16x16();
2039        transmute(simd_select::<i16x16, _>(simd_lt(a, b), a, b))
2040    }
2041}
2042
2043/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2044/// the packed minimum values.
2045///
2046/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32)
2047#[inline]
2048#[target_feature(enable = "avx2")]
2049#[cfg_attr(test, assert_instr(vpminud))]
2050#[stable(feature = "simd_x86", since = "1.27.0")]
2051pub fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
2052    unsafe {
2053        let a = a.as_u32x8();
2054        let b = b.as_u32x8();
2055        transmute(simd_select::<i32x8, _>(simd_lt(a, b), a, b))
2056    }
2057}
2058
2059/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2060/// the packed minimum values.
2061///
2062/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8)
2063#[inline]
2064#[target_feature(enable = "avx2")]
2065#[cfg_attr(test, assert_instr(vpminub))]
2066#[stable(feature = "simd_x86", since = "1.27.0")]
2067pub fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
2068    unsafe {
2069        let a = a.as_u8x32();
2070        let b = b.as_u8x32();
2071        transmute(simd_select::<i8x32, _>(simd_lt(a, b), a, b))
2072    }
2073}
2074
/// Creates a mask from the most significant bit of each 8-bit element in
/// `a`, and returns the result.
2077///
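/// Picturing `a` as `[u8; 32]`, the operation is roughly equivalent to
/// (illustration only, not the actual implementation):
///
/// ```
/// fn movemask_epi8(a: [u8; 32]) -> i32 {
///     let mut r = 0u32;
///     for i in 0..32 {
///         // Bit `i` of the result is the most significant bit of byte `i`.
///         r |= ((a[i] as u32) >> 7) << i;
///     }
///     r as i32
/// }
/// ```
///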
2078/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8)
2079#[inline]
2080#[target_feature(enable = "avx2")]
2081#[cfg_attr(test, assert_instr(vpmovmskb))]
2082#[stable(feature = "simd_x86", since = "1.27.0")]
2083pub fn _mm256_movemask_epi8(a: __m256i) -> i32 {
2084    unsafe {
2085        let z = i8x32::ZERO;
2086        let m: i8x32 = simd_lt(a.as_i8x32(), z);
2087        simd_bitmask::<_, u32>(m) as i32
2088    }
2089}
2090
2091/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
2092/// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
2093/// results in dst. Eight SADs are performed for each 128-bit lane using one
2094/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
/// selected from `b` starting at the offset specified in `imm8`. Eight
2096/// quadruplets are formed from sequential 8-bit integers selected from `a`
2097/// starting at the offset specified in `imm8`.
2098///
2099/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8)
2100#[inline]
2101#[target_feature(enable = "avx2")]
2102#[cfg_attr(test, assert_instr(vmpsadbw, IMM8 = 0))]
2103#[rustc_legacy_const_generics(2)]
2104#[stable(feature = "simd_x86", since = "1.27.0")]
2105pub fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
2106    static_assert_uimm_bits!(IMM8, 8);
2107    unsafe { transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8 as i8)) }
2108}
2109
2110/// Multiplies the low 32-bit integers from each packed 64-bit element in
2111/// `a` and `b`
2112///
2113/// Returns the 64-bit results.
2114///
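/// Picturing the inputs as `[i32; 8]`, the operation is roughly equivalent
/// to this sketch (illustration only, not the actual implementation):
///
/// ```
/// fn mul_epi32(a: [i32; 8], b: [i32; 8]) -> [i64; 4] {
///     let mut r = [0i64; 4];
///     for i in 0..4 {
///         // Only the low (even-indexed) 32-bit element of each 64-bit
///         // pair takes part in the multiplication.
///         r[i] = a[2 * i] as i64 * b[2 * i] as i64;
///     }
///     r
/// }
/// ```
///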
2115/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32)
2116#[inline]
2117#[target_feature(enable = "avx2")]
2118#[cfg_attr(test, assert_instr(vpmuldq))]
2119#[stable(feature = "simd_x86", since = "1.27.0")]
2120pub fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
2121    unsafe {
2122        let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4()));
2123        let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4()));
2124        transmute(simd_mul(a, b))
2125    }
2126}
2127
2128/// Multiplies the low unsigned 32-bit integers from each packed 64-bit
2129/// element in `a` and `b`
2130///
2131/// Returns the unsigned 64-bit results.
2132///
2133/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32)
2134#[inline]
2135#[target_feature(enable = "avx2")]
2136#[cfg_attr(test, assert_instr(vpmuludq))]
2137#[stable(feature = "simd_x86", since = "1.27.0")]
2138pub fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
2139    unsafe {
2140        let a = a.as_u64x4();
2141        let b = b.as_u64x4();
2142        let mask = u64x4::splat(u32::MAX.into());
2143        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
2144    }
2145}
2146
2147/// Multiplies the packed 16-bit integers in `a` and `b`, producing
2148/// intermediate 32-bit integers and returning the high 16 bits of the
2149/// intermediate integers.
2150///
2151/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16)
2152#[inline]
2153#[target_feature(enable = "avx2")]
2154#[cfg_attr(test, assert_instr(vpmulhw))]
2155#[stable(feature = "simd_x86", since = "1.27.0")]
2156pub fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
2157    unsafe {
2158        let a = simd_cast::<_, i32x16>(a.as_i16x16());
2159        let b = simd_cast::<_, i32x16>(b.as_i16x16());
2160        let r = simd_shr(simd_mul(a, b), i32x16::splat(16));
2161        transmute(simd_cast::<i32x16, i16x16>(r))
2162    }
2163}
2164
2165/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
2166/// intermediate 32-bit integers and returning the high 16 bits of the
2167/// intermediate integers.
2168///
2169/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16)
2170#[inline]
2171#[target_feature(enable = "avx2")]
2172#[cfg_attr(test, assert_instr(vpmulhuw))]
2173#[stable(feature = "simd_x86", since = "1.27.0")]
2174pub fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
2175    unsafe {
2176        let a = simd_cast::<_, u32x16>(a.as_u16x16());
2177        let b = simd_cast::<_, u32x16>(b.as_u16x16());
2178        let r = simd_shr(simd_mul(a, b), u32x16::splat(16));
2179        transmute(simd_cast::<u32x16, u16x16>(r))
2180    }
2181}
2182
2183/// Multiplies the packed 16-bit integers in `a` and `b`, producing
2184/// intermediate 32-bit integers, and returns the low 16 bits of the
2185/// intermediate integers
2186///
2187/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16)
2188#[inline]
2189#[target_feature(enable = "avx2")]
2190#[cfg_attr(test, assert_instr(vpmullw))]
2191#[stable(feature = "simd_x86", since = "1.27.0")]
2192pub fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
2193    unsafe { transmute(simd_mul(a.as_i16x16(), b.as_i16x16())) }
2194}
2195
2196/// Multiplies the packed 32-bit integers in `a` and `b`, producing
2197/// intermediate 64-bit integers, and returns the low 32 bits of the
2198/// intermediate integers
2199///
2200/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32)
2201#[inline]
2202#[target_feature(enable = "avx2")]
2203#[cfg_attr(test, assert_instr(vpmulld))]
2204#[stable(feature = "simd_x86", since = "1.27.0")]
2205pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
2206    unsafe { transmute(simd_mul(a.as_i32x8(), b.as_i32x8())) }
2207}
2208
/// Multiplies packed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Truncates each intermediate
/// integer to the 18 most significant bits, rounds by adding 1, and
/// returns bits `[16:1]`.
2213///
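/// Per element, the computation is roughly equivalent to the following
/// scalar sketch (illustration only, not the actual implementation):
///
/// ```
/// fn mulhrs_epi16(a: i16, b: i16) -> i16 {
///     // Widen, multiply, add the rounding constant, and keep bits [16:1].
///     (((a as i32 * b as i32) + 0x4000) >> 15) as i16
/// }
/// ```
///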
2214/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16)
2215#[inline]
2216#[target_feature(enable = "avx2")]
2217#[cfg_attr(test, assert_instr(vpmulhrsw))]
2218#[stable(feature = "simd_x86", since = "1.27.0")]
2219pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
2220    unsafe { transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) }
2221}
2222
2223/// Computes the bitwise OR of 256 bits (representing integer data) in `a`
2224/// and `b`
2225///
2226/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256)
2227#[inline]
2228#[target_feature(enable = "avx2")]
2229#[cfg_attr(test, assert_instr(vorps))]
2230#[stable(feature = "simd_x86", since = "1.27.0")]
2231pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
2232    unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) }
2233}
2234
2235/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2236/// using signed saturation
2237///
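/// Note that the packing is performed within each 128-bit lane, roughly as
/// in this scalar sketch (illustration only, not the actual implementation):
///
/// ```
/// fn packs_epi16(a: [i16; 16], b: [i16; 16]) -> [i8; 32] {
///     let mut r = [0i8; 32];
///     for lane in 0..2 {
///         for i in 0..8 {
///             // Each 128-bit lane holds the saturated elements of `a`
///             // followed by the saturated elements of `b` from that lane.
///             r[16 * lane + i] = a[8 * lane + i].clamp(-128, 127) as i8;
///             r[16 * lane + 8 + i] = b[8 * lane + i].clamp(-128, 127) as i8;
///         }
///     }
///     r
/// }
/// ```
///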
2238/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16)
2239#[inline]
2240#[target_feature(enable = "avx2")]
2241#[cfg_attr(test, assert_instr(vpacksswb))]
2242#[stable(feature = "simd_x86", since = "1.27.0")]
2243pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
2244    unsafe { transmute(packsswb(a.as_i16x16(), b.as_i16x16())) }
2245}
2246
2247/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2248/// using signed saturation
2249///
2250/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32)
2251#[inline]
2252#[target_feature(enable = "avx2")]
2253#[cfg_attr(test, assert_instr(vpackssdw))]
2254#[stable(feature = "simd_x86", since = "1.27.0")]
2255pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
2256    unsafe { transmute(packssdw(a.as_i32x8(), b.as_i32x8())) }
2257}
2258
2259/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2260/// using unsigned saturation
2261///
2262/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16)
2263#[inline]
2264#[target_feature(enable = "avx2")]
2265#[cfg_attr(test, assert_instr(vpackuswb))]
2266#[stable(feature = "simd_x86", since = "1.27.0")]
2267pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
2268    unsafe { transmute(packuswb(a.as_i16x16(), b.as_i16x16())) }
2269}
2270
2271/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2272/// using unsigned saturation
2273///
2274/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32)
2275#[inline]
2276#[target_feature(enable = "avx2")]
2277#[cfg_attr(test, assert_instr(vpackusdw))]
2278#[stable(feature = "simd_x86", since = "1.27.0")]
2279pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
2280    unsafe { transmute(packusdw(a.as_i32x8(), b.as_i32x8())) }
2281}
2282
2283/// Permutes packed 32-bit integers from `a` according to the content of `b`.
2284///
/// The lowest 3 bits of each integer of `b` are used as addresses into the 8
/// integers of `a`.
2287///
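/// A scalar sketch of the operation (illustration only, not the actual
/// implementation):
///
/// ```
/// fn permutevar8x32_epi32(a: [i32; 8], b: [i32; 8]) -> [i32; 8] {
///     let mut r = [0i32; 8];
///     for i in 0..8 {
///         // The index may select any of the 8 elements, crossing the
///         // 128-bit lane boundary.
///         r[i] = a[(b[i] & 0b111) as usize];
///     }
///     r
/// }
/// ```
///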
2288/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32)
2289#[inline]
2290#[target_feature(enable = "avx2")]
2291#[cfg_attr(test, assert_instr(vpermps))]
2292#[stable(feature = "simd_x86", since = "1.27.0")]
2293pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
2294    unsafe { transmute(permd(a.as_u32x8(), b.as_u32x8())) }
2295}
2296
2297/// Permutes 64-bit integers from `a` using control mask `imm8`.
2298///
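/// Each 2-bit field of `imm8` selects the source element for the
/// corresponding destination element, roughly as in this scalar sketch
/// (illustration only, not the actual implementation):
///
/// ```
/// fn permute4x64_epi64(a: [i64; 4], imm8: u8) -> [i64; 4] {
///     let mut r = [0i64; 4];
///     for i in 0..4 {
///         r[i] = a[((imm8 >> (2 * i)) & 0b11) as usize];
///     }
///     r
/// }
/// ```
///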
2299/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64)
2300#[inline]
2301#[target_feature(enable = "avx2")]
2302#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 9))]
2303#[rustc_legacy_const_generics(1)]
2304#[stable(feature = "simd_x86", since = "1.27.0")]
2305pub fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
2306    static_assert_uimm_bits!(IMM8, 8);
2307    unsafe {
2308        let zero = i64x4::ZERO;
2309        let r: i64x4 = simd_shuffle!(
2310            a.as_i64x4(),
2311            zero,
2312            [
2313                IMM8 as u32 & 0b11,
2314                (IMM8 as u32 >> 2) & 0b11,
2315                (IMM8 as u32 >> 4) & 0b11,
2316                (IMM8 as u32 >> 6) & 0b11,
2317            ],
2318        );
2319        transmute(r)
2320    }
2321}
2322
/// Shuffles 128 bits of integer data selected by `imm8` from `a` and `b`.
2324///
2325/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256)
2326#[inline]
2327#[target_feature(enable = "avx2")]
2328#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 9))]
2329#[rustc_legacy_const_generics(2)]
2330#[stable(feature = "simd_x86", since = "1.27.0")]
2331pub fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
2332    static_assert_uimm_bits!(IMM8, 8);
2333    _mm256_permute2f128_si256::<IMM8>(a, b)
2334}
2335
2336/// Shuffles 64-bit floating-point elements in `a` across lanes using the
2337/// control in `imm8`.
2338///
2339/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_pd)
2340#[inline]
2341#[target_feature(enable = "avx2")]
2342#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 1))]
2343#[rustc_legacy_const_generics(1)]
2344#[stable(feature = "simd_x86", since = "1.27.0")]
2345pub fn _mm256_permute4x64_pd<const IMM8: i32>(a: __m256d) -> __m256d {
2346    static_assert_uimm_bits!(IMM8, 8);
2347    unsafe {
2348        simd_shuffle!(
2349            a,
2350            _mm256_undefined_pd(),
2351            [
2352                IMM8 as u32 & 0b11,
2353                (IMM8 as u32 >> 2) & 0b11,
2354                (IMM8 as u32 >> 4) & 0b11,
2355                (IMM8 as u32 >> 6) & 0b11,
2356            ],
2357        )
2358    }
2359}
2360
2361/// Shuffles eight 32-bit floating-point elements in `a` across lanes using
2362/// the corresponding 32-bit integer index in `idx`.
2363///
2364/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps)
2365#[inline]
2366#[target_feature(enable = "avx2")]
2367#[cfg_attr(test, assert_instr(vpermps))]
2368#[stable(feature = "simd_x86", since = "1.27.0")]
2369pub fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
2370    unsafe { permps(a, idx.as_i32x8()) }
2371}
2372
/// Computes the absolute differences of packed unsigned 8-bit integers in
/// `a` and `b`, then horizontally sums each consecutive 8 differences to
/// produce four unsigned 16-bit integers, and packs these unsigned 16-bit
/// integers in the low 16 bits of the 64-bit elements of the return value.
2377///
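/// Picturing the inputs as `[u8; 32]`, the operation is roughly equivalent
/// to (illustration only, not the actual implementation):
///
/// ```
/// fn sad_epu8(a: [u8; 32], b: [u8; 32]) -> [u64; 4] {
///     let mut r = [0u64; 4];
///     for i in 0..4 {
///         for j in 0..8 {
///             // Each 64-bit element receives the sum of 8 absolute
///             // differences in its low 16 bits.
///             r[i] += a[8 * i + j].abs_diff(b[8 * i + j]) as u64;
///         }
///     }
///     r
/// }
/// ```
///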
2378/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8)
2379#[inline]
2380#[target_feature(enable = "avx2")]
2381#[cfg_attr(test, assert_instr(vpsadbw))]
2382#[stable(feature = "simd_x86", since = "1.27.0")]
2383pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
2384    unsafe { transmute(psadbw(a.as_u8x32(), b.as_u8x32())) }
2385}
2386
2387/// Shuffles bytes from `a` according to the content of `b`.
2388///
/// For each of the 128-bit low and high halves of the vectors, the low 4
/// bits of each byte of `b` are used as addresses into the respective low
/// or high 16 bytes of `a`. That is, the halves are shuffled separately.
///
/// In addition, if the most significant bit of a byte of `b` is set, the
/// respective destination byte is set to 0.
2395///
2396/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
2397/// equivalent to:
2398///
2399/// ```
2400/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
2401///     let mut r = [0; 32];
2402///     for i in 0..16 {
2403///         // if the most significant bit of b is set,
2404///         // then the destination byte is set to 0.
2405///         if b[i] & 0x80 == 0u8 {
2406///             r[i] = a[(b[i] % 16) as usize];
2407///         }
2408///         if b[i + 16] & 0x80 == 0u8 {
2409///             r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
2410///         }
2411///     }
2412///     r
2413/// }
2414/// ```
2415///
2416/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8)
2417#[inline]
2418#[target_feature(enable = "avx2")]
2419#[cfg_attr(test, assert_instr(vpshufb))]
2420#[stable(feature = "simd_x86", since = "1.27.0")]
2421pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
2422    unsafe { transmute(pshufb(a.as_u8x32(), b.as_u8x32())) }
2423}
2424
2425/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
2426/// `imm8`.
2427///
2428/// ```rust
2429/// #[cfg(target_arch = "x86")]
2430/// use std::arch::x86::*;
2431/// #[cfg(target_arch = "x86_64")]
2432/// use std::arch::x86_64::*;
2433///
2434/// # fn main() {
2435/// #     if is_x86_feature_detected!("avx2") {
2436/// #         #[target_feature(enable = "avx2")]
2437/// #         unsafe fn worker() {
2438/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2439///
2440/// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
2441/// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
2442///
2443/// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
2444/// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
2445///
2446/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
2447/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
2448/// #         }
2449/// #         unsafe { worker(); }
2450/// #     }
2451/// # }
2452/// ```
2453///
2454/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32)
2455#[inline]
2456#[target_feature(enable = "avx2")]
2457#[cfg_attr(test, assert_instr(vshufps, MASK = 9))]
2458#[rustc_legacy_const_generics(1)]
2459#[stable(feature = "simd_x86", since = "1.27.0")]
2460pub fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
2461    static_assert_uimm_bits!(MASK, 8);
2462    unsafe {
2463        let r: i32x8 = simd_shuffle!(
2464            a.as_i32x8(),
2465            a.as_i32x8(),
2466            [
2467                MASK as u32 & 0b11,
2468                (MASK as u32 >> 2) & 0b11,
2469                (MASK as u32 >> 4) & 0b11,
2470                (MASK as u32 >> 6) & 0b11,
2471                (MASK as u32 & 0b11) + 4,
2472                ((MASK as u32 >> 2) & 0b11) + 4,
2473                ((MASK as u32 >> 4) & 0b11) + 4,
2474                ((MASK as u32 >> 6) & 0b11) + 4,
2475            ],
2476        );
2477        transmute(r)
2478    }
2479}
2480
2481/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
2482/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
2483/// to the output.
2484///
2485/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16)
2486#[inline]
2487#[target_feature(enable = "avx2")]
2488#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 9))]
2489#[rustc_legacy_const_generics(1)]
2490#[stable(feature = "simd_x86", since = "1.27.0")]
2491pub fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2492    static_assert_uimm_bits!(IMM8, 8);
2493    unsafe {
2494        let a = a.as_i16x16();
2495        let r: i16x16 = simd_shuffle!(
2496            a,
2497            a,
2498            [
2499                0,
2500                1,
2501                2,
2502                3,
2503                4 + (IMM8 as u32 & 0b11),
2504                4 + ((IMM8 as u32 >> 2) & 0b11),
2505                4 + ((IMM8 as u32 >> 4) & 0b11),
2506                4 + ((IMM8 as u32 >> 6) & 0b11),
2507                8,
2508                9,
2509                10,
2510                11,
2511                12 + (IMM8 as u32 & 0b11),
2512                12 + ((IMM8 as u32 >> 2) & 0b11),
2513                12 + ((IMM8 as u32 >> 4) & 0b11),
2514                12 + ((IMM8 as u32 >> 6) & 0b11),
2515            ],
2516        );
2517        transmute(r)
2518    }
2519}
2520
2521/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
2522/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
2523/// to the output.
2524///
2525/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16)
2526#[inline]
2527#[target_feature(enable = "avx2")]
2528#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 9))]
2529#[rustc_legacy_const_generics(1)]
2530#[stable(feature = "simd_x86", since = "1.27.0")]
2531pub fn _mm256_shufflelo_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2532    static_assert_uimm_bits!(IMM8, 8);
2533    unsafe {
2534        let a = a.as_i16x16();
2535        let r: i16x16 = simd_shuffle!(
2536            a,
2537            a,
2538            [
2539                0 + (IMM8 as u32 & 0b11),
2540                0 + ((IMM8 as u32 >> 2) & 0b11),
2541                0 + ((IMM8 as u32 >> 4) & 0b11),
2542                0 + ((IMM8 as u32 >> 6) & 0b11),
2543                4,
2544                5,
2545                6,
2546                7,
2547                8 + (IMM8 as u32 & 0b11),
2548                8 + ((IMM8 as u32 >> 2) & 0b11),
2549                8 + ((IMM8 as u32 >> 4) & 0b11),
2550                8 + ((IMM8 as u32 >> 6) & 0b11),
2551                12,
2552                13,
2553                14,
2554                15,
2555            ],
2556        );
2557        transmute(r)
2558    }
2559}
2560
2561/// Negates packed 16-bit integers in `a` when the corresponding signed
2562/// 16-bit integer in `b` is negative, and returns the results.
2563/// Results are zeroed out when the corresponding element in `b` is zero.
2564///
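/// Per element, the operation is roughly equivalent to this scalar sketch
/// (illustration only, not the actual implementation):
///
/// ```
/// fn sign_epi16(a: i16, b: i16) -> i16 {
///     if b < 0 {
///         // Negation wraps, so `i16::MIN` stays `i16::MIN`.
///         a.wrapping_neg()
///     } else if b == 0 {
///         0
///     } else {
///         a
///     }
/// }
/// ```
///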
2565/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16)
2566#[inline]
2567#[target_feature(enable = "avx2")]
2568#[cfg_attr(test, assert_instr(vpsignw))]
2569#[stable(feature = "simd_x86", since = "1.27.0")]
2570pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
2571    unsafe { transmute(psignw(a.as_i16x16(), b.as_i16x16())) }
2572}
2573
2574/// Negates packed 32-bit integers in `a` when the corresponding signed
2575/// 32-bit integer in `b` is negative, and returns the results.
2576/// Results are zeroed out when the corresponding element in `b` is zero.
2577///
2578/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32)
2579#[inline]
2580#[target_feature(enable = "avx2")]
2581#[cfg_attr(test, assert_instr(vpsignd))]
2582#[stable(feature = "simd_x86", since = "1.27.0")]
2583pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
2584    unsafe { transmute(psignd(a.as_i32x8(), b.as_i32x8())) }
2585}
2586
2587/// Negates packed 8-bit integers in `a` when the corresponding signed
2588/// 8-bit integer in `b` is negative, and returns the results.
2589/// Results are zeroed out when the corresponding element in `b` is zero.
2590///
2591/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8)
2592#[inline]
2593#[target_feature(enable = "avx2")]
2594#[cfg_attr(test, assert_instr(vpsignb))]
2595#[stable(feature = "simd_x86", since = "1.27.0")]
2596pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
2597    unsafe { transmute(psignb(a.as_i8x32(), b.as_i8x32())) }
2598}
2599
2600/// Shifts packed 16-bit integers in `a` left by `count` while
2601/// shifting in zeros, and returns the result
2602///
2603/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16)
2604#[inline]
2605#[target_feature(enable = "avx2")]
2606#[cfg_attr(test, assert_instr(vpsllw))]
2607#[stable(feature = "simd_x86", since = "1.27.0")]
2608pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
2609    unsafe { transmute(psllw(a.as_i16x16(), count.as_i16x8())) }
2610}
2611
2612/// Shifts packed 32-bit integers in `a` left by `count` while
2613/// shifting in zeros, and returns the result
2614///
2615/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32)
2616#[inline]
2617#[target_feature(enable = "avx2")]
2618#[cfg_attr(test, assert_instr(vpslld))]
2619#[stable(feature = "simd_x86", since = "1.27.0")]
2620pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
2621    unsafe { transmute(pslld(a.as_i32x8(), count.as_i32x4())) }
2622}
2623
2624/// Shifts packed 64-bit integers in `a` left by `count` while
2625/// shifting in zeros, and returns the result
2626///
2627/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64)
2628#[inline]
2629#[target_feature(enable = "avx2")]
2630#[cfg_attr(test, assert_instr(vpsllq))]
2631#[stable(feature = "simd_x86", since = "1.27.0")]
2632pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
2633    unsafe { transmute(psllq(a.as_i64x4(), count.as_i64x2())) }
2634}
2635
2636/// Shifts packed 16-bit integers in `a` left by `IMM8` while
/// shifting in zeros, and returns the results.
2638///
2639/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16)
2640#[inline]
2641#[target_feature(enable = "avx2")]
2642#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 7))]
2643#[rustc_legacy_const_generics(1)]
2644#[stable(feature = "simd_x86", since = "1.27.0")]
2645pub fn _mm256_slli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2646    static_assert_uimm_bits!(IMM8, 8);
2647    unsafe {
2648        if IMM8 >= 16 {
2649            _mm256_setzero_si256()
2650        } else {
2651            transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
2652        }
2653    }
2654}
2655
2656/// Shifts packed 32-bit integers in `a` left by `IMM8` while
/// shifting in zeros, and returns the results.
2658///
2659/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32)
2660#[inline]
2661#[target_feature(enable = "avx2")]
2662#[cfg_attr(test, assert_instr(vpslld, IMM8 = 7))]
2663#[rustc_legacy_const_generics(1)]
2664#[stable(feature = "simd_x86", since = "1.27.0")]
2665pub fn _mm256_slli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
2668        if IMM8 >= 32 {
2669            _mm256_setzero_si256()
2670        } else {
2671            transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
2672        }
2673    }
2674}
2675
2676/// Shifts packed 64-bit integers in `a` left by `IMM8` while
/// shifting in zeros, and returns the results.
2678///
2679/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64)
2680#[inline]
2681#[target_feature(enable = "avx2")]
2682#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 7))]
2683#[rustc_legacy_const_generics(1)]
2684#[stable(feature = "simd_x86", since = "1.27.0")]
2685pub fn _mm256_slli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
2688        if IMM8 >= 64 {
2689            _mm256_setzero_si256()
2690        } else {
2691            transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
2692        }
2693    }
2694}
2695
2696/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2697///
2698/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256)
2699#[inline]
2700#[target_feature(enable = "avx2")]
2701#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
2702#[rustc_legacy_const_generics(1)]
2703#[stable(feature = "simd_x86", since = "1.27.0")]
2704pub fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
2705    static_assert_uimm_bits!(IMM8, 8);
2706    _mm256_bslli_epi128::<IMM8>(a)
2707}
2708
2709/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2710///
2711/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128)
2712#[inline]
2713#[target_feature(enable = "avx2")]
2714#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
2715#[rustc_legacy_const_generics(1)]
2716#[stable(feature = "simd_x86", since = "1.27.0")]
2717pub fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
2718    static_assert_uimm_bits!(IMM8, 8);
2719    const fn mask(shift: i32, i: u32) -> u32 {
2720        let shift = shift as u32 & 0xff;
2721        if shift > 15 || i % 16 < shift {
2722            0
2723        } else {
2724            32 + (i - shift)
2725        }
2726    }
2727    unsafe {
2728        let a = a.as_i8x32();
2729        let r: i8x32 = simd_shuffle!(
2730            i8x32::ZERO,
2731            a,
2732            [
2733                mask(IMM8, 0),
2734                mask(IMM8, 1),
2735                mask(IMM8, 2),
2736                mask(IMM8, 3),
2737                mask(IMM8, 4),
2738                mask(IMM8, 5),
2739                mask(IMM8, 6),
2740                mask(IMM8, 7),
2741                mask(IMM8, 8),
2742                mask(IMM8, 9),
2743                mask(IMM8, 10),
2744                mask(IMM8, 11),
2745                mask(IMM8, 12),
2746                mask(IMM8, 13),
2747                mask(IMM8, 14),
2748                mask(IMM8, 15),
2749                mask(IMM8, 16),
2750                mask(IMM8, 17),
2751                mask(IMM8, 18),
2752                mask(IMM8, 19),
2753                mask(IMM8, 20),
2754                mask(IMM8, 21),
2755                mask(IMM8, 22),
2756                mask(IMM8, 23),
2757                mask(IMM8, 24),
2758                mask(IMM8, 25),
2759                mask(IMM8, 26),
2760                mask(IMM8, 27),
2761                mask(IMM8, 28),
2762                mask(IMM8, 29),
2763                mask(IMM8, 30),
2764                mask(IMM8, 31),
2765            ],
2766        );
2767        transmute(r)
2768    }
2769}
2770
2771/// Shifts packed 32-bit integers in `a` left by the amount
2772/// specified by the corresponding element in `count` while
2773/// shifting in zeros, and returns the result.
2774///
2775/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32)
2776#[inline]
2777#[target_feature(enable = "avx2")]
2778#[cfg_attr(test, assert_instr(vpsllvd))]
2779#[stable(feature = "simd_x86", since = "1.27.0")]
2780pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
2781    unsafe { transmute(psllvd(a.as_i32x4(), count.as_i32x4())) }
2782}
2783
2784/// Shifts packed 32-bit integers in `a` left by the amount
2785/// specified by the corresponding element in `count` while
2786/// shifting in zeros, and returns the result.
2787///
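/// A small sketch using a different shift count for each element:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(1);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
/// let r = _mm256_sllv_epi32(a, count);
/// let e = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, e)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///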
2788/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32)
2789#[inline]
2790#[target_feature(enable = "avx2")]
2791#[cfg_attr(test, assert_instr(vpsllvd))]
2792#[stable(feature = "simd_x86", since = "1.27.0")]
2793pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
2794    unsafe { transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) }
2795}
2796
2797/// Shifts packed 64-bit integers in `a` left by the amount
2798/// specified by the corresponding element in `count` while
2799/// shifting in zeros, and returns the result.
2800///
2801/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64)
2802#[inline]
2803#[target_feature(enable = "avx2")]
2804#[cfg_attr(test, assert_instr(vpsllvq))]
2805#[stable(feature = "simd_x86", since = "1.27.0")]
2806pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
2807    unsafe { transmute(psllvq(a.as_i64x2(), count.as_i64x2())) }
2808}
2809
2810/// Shifts packed 64-bit integers in `a` left by the amount
2811/// specified by the corresponding element in `count` while
2812/// shifting in zeros, and returns the result.
2813///
2814/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64)
2815#[inline]
2816#[target_feature(enable = "avx2")]
2817#[cfg_attr(test, assert_instr(vpsllvq))]
2818#[stable(feature = "simd_x86", since = "1.27.0")]
2819pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
2820    unsafe { transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) }
2821}
2822
2823/// Shifts packed 16-bit integers in `a` right by `count` while
2824/// shifting in sign bits.
2825///
2826/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16)
2827#[inline]
2828#[target_feature(enable = "avx2")]
2829#[cfg_attr(test, assert_instr(vpsraw))]
2830#[stable(feature = "simd_x86", since = "1.27.0")]
2831pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
2832    unsafe { transmute(psraw(a.as_i16x16(), count.as_i16x8())) }
2833}
2834
2835/// Shifts packed 32-bit integers in `a` right by `count` while
2836/// shifting in sign bits.
2837///
2838/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32)
2839#[inline]
2840#[target_feature(enable = "avx2")]
2841#[cfg_attr(test, assert_instr(vpsrad))]
2842#[stable(feature = "simd_x86", since = "1.27.0")]
2843pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
2844    unsafe { transmute(psrad(a.as_i32x8(), count.as_i32x4())) }
2845}
2846
2847/// Shifts packed 16-bit integers in `a` right by `IMM8` while
2848/// shifting in sign bits.
2849///
2850/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16)
2851#[inline]
2852#[target_feature(enable = "avx2")]
2853#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 7))]
2854#[rustc_legacy_const_generics(1)]
2855#[stable(feature = "simd_x86", since = "1.27.0")]
2856pub fn _mm256_srai_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2857    static_assert_uimm_bits!(IMM8, 8);
2858    unsafe { transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) }
2859}
2860
2861/// Shifts packed 32-bit integers in `a` right by `IMM8` while
2862/// shifting in sign bits.
2863///
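/// A small sketch; the sign bit is replicated, so negative elements stay
/// negative:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(-32, -16, -8, -4, 4, 8, 16, 32);
/// let r = _mm256_srai_epi32::<2>(a);
/// let e = _mm256_setr_epi32(-8, -4, -2, -1, 1, 2, 4, 8);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, e)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///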
2864/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32)
2865#[inline]
2866#[target_feature(enable = "avx2")]
2867#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 7))]
2868#[rustc_legacy_const_generics(1)]
2869#[stable(feature = "simd_x86", since = "1.27.0")]
2870pub fn _mm256_srai_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
2871    static_assert_uimm_bits!(IMM8, 8);
2872    unsafe { transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) }
2873}
2874
2875/// Shifts packed 32-bit integers in `a` right by the amount specified by the
2876/// corresponding element in `count` while shifting in sign bits.
2877///
2878/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32)
2879#[inline]
2880#[target_feature(enable = "avx2")]
2881#[cfg_attr(test, assert_instr(vpsravd))]
2882#[stable(feature = "simd_x86", since = "1.27.0")]
2883pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
2884    unsafe { transmute(psravd(a.as_i32x4(), count.as_i32x4())) }
2885}
2886
2887/// Shifts packed 32-bit integers in `a` right by the amount specified by the
2888/// corresponding element in `count` while shifting in sign bits.
2889///
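/// A small sketch applying per-element counts to a negative input:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(-64);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
/// let r = _mm256_srav_epi32(a, count);
/// let e = _mm256_setr_epi32(-64, -32, -16, -8, -4, -2, -1, -1);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, e)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///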
2890/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32)
2891#[inline]
2892#[target_feature(enable = "avx2")]
2893#[cfg_attr(test, assert_instr(vpsravd))]
2894#[stable(feature = "simd_x86", since = "1.27.0")]
2895pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
2896    unsafe { transmute(psravd256(a.as_i32x8(), count.as_i32x8())) }
2897}
2898
/// Shifts 128-bit lanes in `a` right by `IMM8` bytes while shifting in zeros.
2900///
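/// A small sketch; each 128-bit lane is shifted independently, so the highest
/// byte of each lane is filled with zero:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(1);
/// let r = _mm256_srli_si256::<1>(a);
/// let e = _mm256_setr_epi8(
///     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
///     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, e)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///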
2901/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256)
2902#[inline]
2903#[target_feature(enable = "avx2")]
2904#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
2905#[rustc_legacy_const_generics(1)]
2906#[stable(feature = "simd_x86", since = "1.27.0")]
2907pub fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
2908    static_assert_uimm_bits!(IMM8, 8);
2909    _mm256_bsrli_epi128::<IMM8>(a)
2910}
2911
/// Shifts 128-bit lanes in `a` right by `IMM8` bytes while shifting in zeros.
2913///
2914/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128)
2915#[inline]
2916#[target_feature(enable = "avx2")]
2917#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
2918#[rustc_legacy_const_generics(1)]
2919#[stable(feature = "simd_x86", since = "1.27.0")]
2920pub fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
2921    static_assert_uimm_bits!(IMM8, 8);
2922    const fn mask(shift: i32, i: u32) -> u32 {
2923        let shift = shift as u32 & 0xff;
2924        if shift > 15 || (15 - (i % 16)) < shift {
2925            0
2926        } else {
2927            32 + (i + shift)
2928        }
2929    }
2930    unsafe {
2931        let a = a.as_i8x32();
2932        let r: i8x32 = simd_shuffle!(
2933            i8x32::ZERO,
2934            a,
2935            [
2936                mask(IMM8, 0),
2937                mask(IMM8, 1),
2938                mask(IMM8, 2),
2939                mask(IMM8, 3),
2940                mask(IMM8, 4),
2941                mask(IMM8, 5),
2942                mask(IMM8, 6),
2943                mask(IMM8, 7),
2944                mask(IMM8, 8),
2945                mask(IMM8, 9),
2946                mask(IMM8, 10),
2947                mask(IMM8, 11),
2948                mask(IMM8, 12),
2949                mask(IMM8, 13),
2950                mask(IMM8, 14),
2951                mask(IMM8, 15),
2952                mask(IMM8, 16),
2953                mask(IMM8, 17),
2954                mask(IMM8, 18),
2955                mask(IMM8, 19),
2956                mask(IMM8, 20),
2957                mask(IMM8, 21),
2958                mask(IMM8, 22),
2959                mask(IMM8, 23),
2960                mask(IMM8, 24),
2961                mask(IMM8, 25),
2962                mask(IMM8, 26),
2963                mask(IMM8, 27),
2964                mask(IMM8, 28),
2965                mask(IMM8, 29),
2966                mask(IMM8, 30),
2967                mask(IMM8, 31),
2968            ],
2969        );
2970        transmute(r)
2971    }
2972}
2973
2974/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
2975/// zeros.
2976///
2977/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16)
2978#[inline]
2979#[target_feature(enable = "avx2")]
2980#[cfg_attr(test, assert_instr(vpsrlw))]
2981#[stable(feature = "simd_x86", since = "1.27.0")]
2982pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
2983    unsafe { transmute(psrlw(a.as_i16x16(), count.as_i16x8())) }
2984}
2985
2986/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
2987/// zeros.
2988///
2989/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32)
2990#[inline]
2991#[target_feature(enable = "avx2")]
2992#[cfg_attr(test, assert_instr(vpsrld))]
2993#[stable(feature = "simd_x86", since = "1.27.0")]
2994pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
2995    unsafe { transmute(psrld(a.as_i32x8(), count.as_i32x4())) }
2996}
2997
2998/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
2999/// zeros.
3000///
3001/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64)
3002#[inline]
3003#[target_feature(enable = "avx2")]
3004#[cfg_attr(test, assert_instr(vpsrlq))]
3005#[stable(feature = "simd_x86", since = "1.27.0")]
3006pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
3007    unsafe { transmute(psrlq(a.as_i64x4(), count.as_i64x2())) }
3008}
3009
3010/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
3012///
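/// A small sketch of the logical shift with arbitrary example values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(0x0100);
/// let r = _mm256_srli_epi16::<8>(a);
/// let e = _mm256_set1_epi16(1);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, e)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///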
3013/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16)
3014#[inline]
3015#[target_feature(enable = "avx2")]
3016#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 7))]
3017#[rustc_legacy_const_generics(1)]
3018#[stable(feature = "simd_x86", since = "1.27.0")]
3019pub fn _mm256_srli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
3020    static_assert_uimm_bits!(IMM8, 8);
3021    unsafe {
3022        if IMM8 >= 16 {
3023            _mm256_setzero_si256()
3024        } else {
3025            transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
3026        }
3027    }
3028}
3029
3030/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
3032///
3033/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32)
3034#[inline]
3035#[target_feature(enable = "avx2")]
3036#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 7))]
3037#[rustc_legacy_const_generics(1)]
3038#[stable(feature = "simd_x86", since = "1.27.0")]
3039pub fn _mm256_srli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
3040    static_assert_uimm_bits!(IMM8, 8);
3041    unsafe {
3042        if IMM8 >= 32 {
3043            _mm256_setzero_si256()
3044        } else {
3045            transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
3046        }
3047    }
3048}
3049
3050/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
/// zeros.
3052///
3053/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64)
3054#[inline]
3055#[target_feature(enable = "avx2")]
3056#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 7))]
3057#[rustc_legacy_const_generics(1)]
3058#[stable(feature = "simd_x86", since = "1.27.0")]
3059pub fn _mm256_srli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
3060    static_assert_uimm_bits!(IMM8, 8);
3061    unsafe {
3062        if IMM8 >= 64 {
3063            _mm256_setzero_si256()
3064        } else {
3065            transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
3066        }
3067    }
3068}
3069
3070/// Shifts packed 32-bit integers in `a` right by the amount specified by
/// the corresponding element in `count` while shifting in zeros.
3072///
3073/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32)
3074#[inline]
3075#[target_feature(enable = "avx2")]
3076#[cfg_attr(test, assert_instr(vpsrlvd))]
3077#[stable(feature = "simd_x86", since = "1.27.0")]
3078pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
3079    unsafe { transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) }
3080}
3081
3082/// Shifts packed 32-bit integers in `a` right by the amount specified by
/// the corresponding element in `count` while shifting in zeros.
3084///
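/// A small sketch; each element uses its own count from `count`:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(128);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
/// let r = _mm256_srlv_epi32(a, count);
/// let e = _mm256_setr_epi32(128, 64, 32, 16, 8, 4, 2, 1);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, e)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///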
3085/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32)
3086#[inline]
3087#[target_feature(enable = "avx2")]
3088#[cfg_attr(test, assert_instr(vpsrlvd))]
3089#[stable(feature = "simd_x86", since = "1.27.0")]
3090pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
3091    unsafe { transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) }
3092}
3093
3094/// Shifts packed 64-bit integers in `a` right by the amount specified by
/// the corresponding element in `count` while shifting in zeros.
3096///
3097/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64)
3098#[inline]
3099#[target_feature(enable = "avx2")]
3100#[cfg_attr(test, assert_instr(vpsrlvq))]
3101#[stable(feature = "simd_x86", since = "1.27.0")]
3102pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
3103    unsafe { transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) }
3104}
3105
3106/// Shifts packed 64-bit integers in `a` right by the amount specified by
/// the corresponding element in `count` while shifting in zeros.
3108///
3109/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64)
3110#[inline]
3111#[target_feature(enable = "avx2")]
3112#[cfg_attr(test, assert_instr(vpsrlvq))]
3113#[stable(feature = "simd_x86", since = "1.27.0")]
3114pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
3115    unsafe { transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) }
3116}
3117
/// Loads 256 bits of integer data from memory into the result using a non-temporal memory
/// hint. `mem_addr` must be aligned on a 32-byte boundary or a general-protection exception
/// may be generated. To minimize caching, the data is flagged as non-temporal (unlikely to
/// be used again soon).
3121///
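/// A small sketch; the `Aligned` wrapper type below is just one way to obtain
/// the required 32-byte alignment:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// #[repr(align(32))]
/// struct Aligned([i32; 8]);
///
/// let data = Aligned([0, 1, 2, 3, 4, 5, 6, 7]);
/// let r = unsafe { _mm256_stream_load_si256(data.0.as_ptr() as *const __m256i) };
/// let e = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, e)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///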
3122/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_load_si256)
3123#[inline]
3124#[target_feature(enable = "avx2")]
3125#[cfg_attr(test, assert_instr(vmovntdqa))]
3126#[stable(feature = "simd_x86_updates", since = "1.82.0")]
3127pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i {
3128    let dst: __m256i;
3129    crate::arch::asm!(
3130        vpl!("vmovntdqa {a}"),
3131        a = out(ymm_reg) dst,
3132        p = in(reg) mem_addr,
3133        options(pure, readonly, nostack, preserves_flags),
3134    );
3135    dst
3136}
3137
/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
3139///
3140/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16)
3141#[inline]
3142#[target_feature(enable = "avx2")]
3143#[cfg_attr(test, assert_instr(vpsubw))]
3144#[stable(feature = "simd_x86", since = "1.27.0")]
3145pub fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
3146    unsafe { transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) }
3147}
3148
/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
3150///
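/// A small sketch of the element-wise subtraction:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
/// let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
/// let r = _mm256_sub_epi32(a, b);
/// let e = _mm256_setr_epi32(9, 18, 27, 36, 45, 54, 63, 72);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, e)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///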
3151/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32)
3152#[inline]
3153#[target_feature(enable = "avx2")]
3154#[cfg_attr(test, assert_instr(vpsubd))]
3155#[stable(feature = "simd_x86", since = "1.27.0")]
3156pub fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
3157    unsafe { transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) }
3158}
3159
/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
3161///
3162/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64)
3163#[inline]
3164#[target_feature(enable = "avx2")]
3165#[cfg_attr(test, assert_instr(vpsubq))]
3166#[stable(feature = "simd_x86", since = "1.27.0")]
3167pub fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
3168    unsafe { transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) }
3169}
3170
/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
3172///
3173/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8)
3174#[inline]
3175#[target_feature(enable = "avx2")]
3176#[cfg_attr(test, assert_instr(vpsubb))]
3177#[stable(feature = "simd_x86", since = "1.27.0")]
3178pub fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
3179    unsafe { transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) }
3180}
3181
/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in
3183/// `a` using saturation.
3184///
3185/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16)
3186#[inline]
3187#[target_feature(enable = "avx2")]
3188#[cfg_attr(test, assert_instr(vpsubsw))]
3189#[stable(feature = "simd_x86", since = "1.27.0")]
3190pub fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
3191    unsafe { transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16())) }
3192}
3193
/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in
3195/// `a` using saturation.
3196///
3197/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8)
3198#[inline]
3199#[target_feature(enable = "avx2")]
3200#[cfg_attr(test, assert_instr(vpsubsb))]
3201#[stable(feature = "simd_x86", since = "1.27.0")]
3202pub fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
3203    unsafe { transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32())) }
3204}
3205
/// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned
/// 16-bit integers in `a` using saturation.
3208///
3209/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16)
3210#[inline]
3211#[target_feature(enable = "avx2")]
3212#[cfg_attr(test, assert_instr(vpsubusw))]
3213#[stable(feature = "simd_x86", since = "1.27.0")]
3214pub fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
3215    unsafe { transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16())) }
3216}
3217
/// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned
/// 8-bit integers in `a` using saturation.
3220///
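/// A small sketch; the subtraction would underflow, so the unsigned result
/// saturates to zero:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(10);
/// let b = _mm256_set1_epi8(100);
/// let r = _mm256_subs_epu8(a, b);
/// let zero = _mm256_setzero_si256();
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, zero)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///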
3221/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8)
3222#[inline]
3223#[target_feature(enable = "avx2")]
3224#[cfg_attr(test, assert_instr(vpsubusb))]
3225#[stable(feature = "simd_x86", since = "1.27.0")]
3226pub fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
3227    unsafe { transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32())) }
3228}
3229
/// Unpacks and interleaves 8-bit integers from the high half of each
/// 128-bit lane of `a` and `b`.
3232///
3233/// ```rust
3234/// #[cfg(target_arch = "x86")]
3235/// use std::arch::x86::*;
3236/// #[cfg(target_arch = "x86_64")]
3237/// use std::arch::x86_64::*;
3238///
3239/// # fn main() {
3240/// #     if is_x86_feature_detected!("avx2") {
3241/// #         #[target_feature(enable = "avx2")]
3242/// #         unsafe fn worker() {
3243/// let a = _mm256_setr_epi8(
3244///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3245///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3246/// );
3247/// let b = _mm256_setr_epi8(
3248///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3249///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3250///     -30, -31,
3251/// );
3252///
3253/// let c = _mm256_unpackhi_epi8(a, b);
3254///
3255/// let expected = _mm256_setr_epi8(
3256///     8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
3257///     24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
3258///     -31,
3259/// );
3260/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3261///
3262/// #         }
3263/// #         unsafe { worker(); }
3264/// #     }
3265/// # }
3266/// ```
3267///
3268/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8)
3269#[inline]
3270#[target_feature(enable = "avx2")]
3271#[cfg_attr(test, assert_instr(vpunpckhbw))]
3272#[stable(feature = "simd_x86", since = "1.27.0")]
3273pub fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
3274    unsafe {
3275        #[rustfmt::skip]
3276        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
3277                8, 40, 9, 41, 10, 42, 11, 43,
3278                12, 44, 13, 45, 14, 46, 15, 47,
3279                24, 56, 25, 57, 26, 58, 27, 59,
3280                28, 60, 29, 61, 30, 62, 31, 63,
3281        ]);
3282        transmute(r)
3283    }
3284}
3285
/// Unpacks and interleaves 8-bit integers from the low half of each
3287/// 128-bit lane of `a` and `b`.
3288///
3289/// ```rust
3290/// #[cfg(target_arch = "x86")]
3291/// use std::arch::x86::*;
3292/// #[cfg(target_arch = "x86_64")]
3293/// use std::arch::x86_64::*;
3294///
3295/// # fn main() {
3296/// #     if is_x86_feature_detected!("avx2") {
3297/// #         #[target_feature(enable = "avx2")]
3298/// #         unsafe fn worker() {
3299/// let a = _mm256_setr_epi8(
3300///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3301///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3302/// );
3303/// let b = _mm256_setr_epi8(
3304///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3305///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3306///     -30, -31,
3307/// );
3308///
3309/// let c = _mm256_unpacklo_epi8(a, b);
3310///
3311/// let expected = _mm256_setr_epi8(
3312///     0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
3313///     -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
3314/// );
3315/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3316///
3317/// #         }
3318/// #         unsafe { worker(); }
3319/// #     }
3320/// # }
3321/// ```
3322///
3323/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8)
3324#[inline]
3325#[target_feature(enable = "avx2")]
3326#[cfg_attr(test, assert_instr(vpunpcklbw))]
3327#[stable(feature = "simd_x86", since = "1.27.0")]
3328pub fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
3329    unsafe {
3330        #[rustfmt::skip]
3331        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
3332            0, 32, 1, 33, 2, 34, 3, 35,
3333            4, 36, 5, 37, 6, 38, 7, 39,
3334            16, 48, 17, 49, 18, 50, 19, 51,
3335            20, 52, 21, 53, 22, 54, 23, 55,
3336        ]);
3337        transmute(r)
3338    }
3339}
3340
/// Unpacks and interleaves 16-bit integers from the high half of each
3342/// 128-bit lane of `a` and `b`.
3343///
3344/// ```rust
3345/// #[cfg(target_arch = "x86")]
3346/// use std::arch::x86::*;
3347/// #[cfg(target_arch = "x86_64")]
3348/// use std::arch::x86_64::*;
3349///
3350/// # fn main() {
3351/// #     if is_x86_feature_detected!("avx2") {
3352/// #         #[target_feature(enable = "avx2")]
3353/// #         unsafe fn worker() {
3354/// let a = _mm256_setr_epi16(
3355///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3356/// );
3357/// let b = _mm256_setr_epi16(
3358///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3359/// );
3360///
3361/// let c = _mm256_unpackhi_epi16(a, b);
3362///
3363/// let expected = _mm256_setr_epi16(
3364///     4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
3365/// );
3366/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3367///
3368/// #         }
3369/// #         unsafe { worker(); }
3370/// #     }
3371/// # }
3372/// ```
3373///
3374/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16)
3375#[inline]
3376#[target_feature(enable = "avx2")]
3377#[cfg_attr(test, assert_instr(vpunpckhwd))]
3378#[stable(feature = "simd_x86", since = "1.27.0")]
3379pub fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
3380    unsafe {
3381        let r: i16x16 = simd_shuffle!(
3382            a.as_i16x16(),
3383            b.as_i16x16(),
3384            [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
3385        );
3386        transmute(r)
3387    }
3388}
3389
/// Unpacks and interleaves 16-bit integers from the low half of each
3391/// 128-bit lane of `a` and `b`.
3392///
3393/// ```rust
3394/// #[cfg(target_arch = "x86")]
3395/// use std::arch::x86::*;
3396/// #[cfg(target_arch = "x86_64")]
3397/// use std::arch::x86_64::*;
3398///
3399/// # fn main() {
3400/// #     if is_x86_feature_detected!("avx2") {
3401/// #         #[target_feature(enable = "avx2")]
3402/// #         unsafe fn worker() {
3403///
3404/// let a = _mm256_setr_epi16(
3405///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3406/// );
3407/// let b = _mm256_setr_epi16(
3408///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3409/// );
3410///
3411/// let c = _mm256_unpacklo_epi16(a, b);
3412///
3413/// let expected = _mm256_setr_epi16(
3414///     0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
3415/// );
3416/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3417///
3418/// #         }
3419/// #         unsafe { worker(); }
3420/// #     }
3421/// # }
3422/// ```
3423///
3424/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16)
3425#[inline]
3426#[target_feature(enable = "avx2")]
3427#[cfg_attr(test, assert_instr(vpunpcklwd))]
3428#[stable(feature = "simd_x86", since = "1.27.0")]
3429pub fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
3430    unsafe {
3431        let r: i16x16 = simd_shuffle!(
3432            a.as_i16x16(),
3433            b.as_i16x16(),
3434            [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
3435        );
3436        transmute(r)
3437    }
3438}
3439
/// Unpacks and interleaves 32-bit integers from the high half of each
3441/// 128-bit lane of `a` and `b`.
3442///
3443/// ```rust
3444/// #[cfg(target_arch = "x86")]
3445/// use std::arch::x86::*;
3446/// #[cfg(target_arch = "x86_64")]
3447/// use std::arch::x86_64::*;
3448///
3449/// # fn main() {
3450/// #     if is_x86_feature_detected!("avx2") {
3451/// #         #[target_feature(enable = "avx2")]
3452/// #         unsafe fn worker() {
3453/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3454/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3455///
3456/// let c = _mm256_unpackhi_epi32(a, b);
3457///
3458/// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
3459/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3460///
3461/// #         }
3462/// #         unsafe { worker(); }
3463/// #     }
3464/// # }
3465/// ```
3466///
3467/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32)
3468#[inline]
3469#[target_feature(enable = "avx2")]
3470#[cfg_attr(test, assert_instr(vunpckhps))]
3471#[stable(feature = "simd_x86", since = "1.27.0")]
3472pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
3473    unsafe {
3474        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
3475        transmute(r)
3476    }
3477}
3478
/// Unpacks and interleaves 32-bit integers from the low half of each
3480/// 128-bit lane of `a` and `b`.
3481///
3482/// ```rust
3483/// #[cfg(target_arch = "x86")]
3484/// use std::arch::x86::*;
3485/// #[cfg(target_arch = "x86_64")]
3486/// use std::arch::x86_64::*;
3487///
3488/// # fn main() {
3489/// #     if is_x86_feature_detected!("avx2") {
3490/// #         #[target_feature(enable = "avx2")]
3491/// #         unsafe fn worker() {
3492/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3493/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3494///
3495/// let c = _mm256_unpacklo_epi32(a, b);
3496///
3497/// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
3498/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3499///
3500/// #         }
3501/// #         unsafe { worker(); }
3502/// #     }
3503/// # }
3504/// ```
3505///
3506/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32)
3507#[inline]
3508#[target_feature(enable = "avx2")]
3509#[cfg_attr(test, assert_instr(vunpcklps))]
3510#[stable(feature = "simd_x86", since = "1.27.0")]
3511pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
3512    unsafe {
3513        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
3514        transmute(r)
3515    }
3516}
3517
/// Unpacks and interleaves 64-bit integers from the high half of each
3519/// 128-bit lane of `a` and `b`.
3520///
3521/// ```rust
3522/// #[cfg(target_arch = "x86")]
3523/// use std::arch::x86::*;
3524/// #[cfg(target_arch = "x86_64")]
3525/// use std::arch::x86_64::*;
3526///
3527/// # fn main() {
3528/// #     if is_x86_feature_detected!("avx2") {
3529/// #         #[target_feature(enable = "avx2")]
3530/// #         unsafe fn worker() {
3531/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3532/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3533///
3534/// let c = _mm256_unpackhi_epi64(a, b);
3535///
3536/// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
3537/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3538///
3539/// #         }
3540/// #         unsafe { worker(); }
3541/// #     }
3542/// # }
3543/// ```
3544///
3545/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64)
3546#[inline]
3547#[target_feature(enable = "avx2")]
3548#[cfg_attr(test, assert_instr(vunpckhpd))]
3549#[stable(feature = "simd_x86", since = "1.27.0")]
3550pub fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
3551    unsafe {
3552        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
3553        transmute(r)
3554    }
3555}
3556
/// Unpacks and interleaves 64-bit integers from the low half of each
3558/// 128-bit lane of `a` and `b`.
3559///
3560/// ```rust
3561/// #[cfg(target_arch = "x86")]
3562/// use std::arch::x86::*;
3563/// #[cfg(target_arch = "x86_64")]
3564/// use std::arch::x86_64::*;
3565///
3566/// # fn main() {
3567/// #     if is_x86_feature_detected!("avx2") {
3568/// #         #[target_feature(enable = "avx2")]
3569/// #         unsafe fn worker() {
3570/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3571/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3572///
3573/// let c = _mm256_unpacklo_epi64(a, b);
3574///
3575/// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
3576/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3577///
3578/// #         }
3579/// #         unsafe { worker(); }
3580/// #     }
3581/// # }
3582/// ```
3583///
3584/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64)
3585#[inline]
3586#[target_feature(enable = "avx2")]
3587#[cfg_attr(test, assert_instr(vunpcklpd))]
3588#[stable(feature = "simd_x86", since = "1.27.0")]
3589pub fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
3590    unsafe {
3591        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
3592        transmute(r)
3593    }
3594}
3595
3596/// Computes the bitwise XOR of 256 bits (representing integer data)
/// in `a` and `b`.
3598///
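/// A small sketch of the bitwise operation:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(0b0101);
/// let b = _mm256_set1_epi8(0b0011);
/// let r = _mm256_xor_si256(a, b);
/// let e = _mm256_set1_epi8(0b0110);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, e)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///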
3599/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256)
3600#[inline]
3601#[target_feature(enable = "avx2")]
3602#[cfg_attr(test, assert_instr(vxorps))]
3603#[stable(feature = "simd_x86", since = "1.27.0")]
3604pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
3605    unsafe { transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) }
3606}
3607
3608/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3609/// integer containing the zero-extended integer data.
3610///
3611/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3612///
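/// A small sketch; the selected byte is zero-extended, so a negative byte
/// comes back as its unsigned value:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi8(
///     -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
///     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
/// );
/// assert_eq!(_mm256_extract_epi8::<0>(a), 255);
/// assert_eq!(_mm256_extract_epi8::<17>(a), 17);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///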
3613/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8)
3614#[inline]
3615#[target_feature(enable = "avx2")]
3616// This intrinsic has no corresponding instruction.
3617#[rustc_legacy_const_generics(1)]
3618#[stable(feature = "simd_x86", since = "1.27.0")]
3619pub fn _mm256_extract_epi8<const INDEX: i32>(a: __m256i) -> i32 {
3620    static_assert_uimm_bits!(INDEX, 5);
3621    unsafe { simd_extract!(a.as_u8x32(), INDEX as u32, u8) as i32 }
3622}
3623
3624/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3625/// integer containing the zero-extended integer data.
3626///
3627/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3628///
3629/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16)
3630#[inline]
3631#[target_feature(enable = "avx2")]
3632// This intrinsic has no corresponding instruction.
3633#[rustc_legacy_const_generics(1)]
3634#[stable(feature = "simd_x86", since = "1.27.0")]
3635pub fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
3636    static_assert_uimm_bits!(INDEX, 4);
3637    unsafe { simd_extract!(a.as_u16x16(), INDEX as u32, u16) as i32 }
3638}
3639
3640#[allow(improper_ctypes)]
3641unsafe extern "C" {
3642    #[link_name = "llvm.x86.avx2.phadd.sw"]
3643    fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
3644    #[link_name = "llvm.x86.avx2.phsub.sw"]
3645    fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
3646    #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
3647    fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
3648    #[link_name = "llvm.x86.avx2.maskload.d"]
3649    fn maskloadd(mem_addr: *const i8, mask: i32x4) -> i32x4;
3650    #[link_name = "llvm.x86.avx2.maskload.d.256"]
3651    fn maskloadd256(mem_addr: *const i8, mask: i32x8) -> i32x8;
3652    #[link_name = "llvm.x86.avx2.maskload.q"]
3653    fn maskloadq(mem_addr: *const i8, mask: i64x2) -> i64x2;
3654    #[link_name = "llvm.x86.avx2.maskload.q.256"]
3655    fn maskloadq256(mem_addr: *const i8, mask: i64x4) -> i64x4;
3656    #[link_name = "llvm.x86.avx2.maskstore.d"]
3657    fn maskstored(mem_addr: *mut i8, mask: i32x4, a: i32x4);
3658    #[link_name = "llvm.x86.avx2.maskstore.d.256"]
3659    fn maskstored256(mem_addr: *mut i8, mask: i32x8, a: i32x8);
3660    #[link_name = "llvm.x86.avx2.maskstore.q"]
3661    fn maskstoreq(mem_addr: *mut i8, mask: i64x2, a: i64x2);
3662    #[link_name = "llvm.x86.avx2.maskstore.q.256"]
3663    fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4);
3664    #[link_name = "llvm.x86.avx2.mpsadbw"]
3665    fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16;
3666    #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
3667    fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
3668    #[link_name = "llvm.x86.avx2.packsswb"]
3669    fn packsswb(a: i16x16, b: i16x16) -> i8x32;
3670    #[link_name = "llvm.x86.avx2.packssdw"]
3671    fn packssdw(a: i32x8, b: i32x8) -> i16x16;
3672    #[link_name = "llvm.x86.avx2.packuswb"]
3673    fn packuswb(a: i16x16, b: i16x16) -> u8x32;
3674    #[link_name = "llvm.x86.avx2.packusdw"]
3675    fn packusdw(a: i32x8, b: i32x8) -> u16x16;
3676    #[link_name = "llvm.x86.avx2.psad.bw"]
3677    fn psadbw(a: u8x32, b: u8x32) -> u64x4;
3678    #[link_name = "llvm.x86.avx2.psign.b"]
3679    fn psignb(a: i8x32, b: i8x32) -> i8x32;
3680    #[link_name = "llvm.x86.avx2.psign.w"]
3681    fn psignw(a: i16x16, b: i16x16) -> i16x16;
3682    #[link_name = "llvm.x86.avx2.psign.d"]
3683    fn psignd(a: i32x8, b: i32x8) -> i32x8;
3684    #[link_name = "llvm.x86.avx2.psll.w"]
3685    fn psllw(a: i16x16, count: i16x8) -> i16x16;
3686    #[link_name = "llvm.x86.avx2.psll.d"]
3687    fn pslld(a: i32x8, count: i32x4) -> i32x8;
3688    #[link_name = "llvm.x86.avx2.psll.q"]
3689    fn psllq(a: i64x4, count: i64x2) -> i64x4;
3690    #[link_name = "llvm.x86.avx2.psllv.d"]
3691    fn psllvd(a: i32x4, count: i32x4) -> i32x4;
3692    #[link_name = "llvm.x86.avx2.psllv.d.256"]
3693    fn psllvd256(a: i32x8, count: i32x8) -> i32x8;
3694    #[link_name = "llvm.x86.avx2.psllv.q"]
3695    fn psllvq(a: i64x2, count: i64x2) -> i64x2;
3696    #[link_name = "llvm.x86.avx2.psllv.q.256"]
3697    fn psllvq256(a: i64x4, count: i64x4) -> i64x4;
3698    #[link_name = "llvm.x86.avx2.psra.w"]
3699    fn psraw(a: i16x16, count: i16x8) -> i16x16;
3700    #[link_name = "llvm.x86.avx2.psra.d"]
3701    fn psrad(a: i32x8, count: i32x4) -> i32x8;
3702    #[link_name = "llvm.x86.avx2.psrav.d"]
3703    fn psravd(a: i32x4, count: i32x4) -> i32x4;
3704    #[link_name = "llvm.x86.avx2.psrav.d.256"]
3705    fn psravd256(a: i32x8, count: i32x8) -> i32x8;
3706    #[link_name = "llvm.x86.avx2.psrl.w"]
3707    fn psrlw(a: i16x16, count: i16x8) -> i16x16;
3708    #[link_name = "llvm.x86.avx2.psrl.d"]
3709    fn psrld(a: i32x8, count: i32x4) -> i32x8;
3710    #[link_name = "llvm.x86.avx2.psrl.q"]
3711    fn psrlq(a: i64x4, count: i64x2) -> i64x4;
3712    #[link_name = "llvm.x86.avx2.psrlv.d"]
3713    fn psrlvd(a: i32x4, count: i32x4) -> i32x4;
3714    #[link_name = "llvm.x86.avx2.psrlv.d.256"]
3715    fn psrlvd256(a: i32x8, count: i32x8) -> i32x8;
3716    #[link_name = "llvm.x86.avx2.psrlv.q"]
3717    fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
3718    #[link_name = "llvm.x86.avx2.psrlv.q.256"]
3719    fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
3720    #[link_name = "llvm.x86.avx2.pshuf.b"]
3721    fn pshufb(a: u8x32, b: u8x32) -> u8x32;
3722    #[link_name = "llvm.x86.avx2.permd"]
3723    fn permd(a: u32x8, b: u32x8) -> u32x8;
3724    #[link_name = "llvm.x86.avx2.permps"]
3725    fn permps(a: __m256, b: i32x8) -> __m256;
3726    #[link_name = "llvm.x86.avx2.gather.d.d"]
3727    fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
3728    #[link_name = "llvm.x86.avx2.gather.d.d.256"]
3729    fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
3730    #[link_name = "llvm.x86.avx2.gather.d.q"]
3731    fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
3732    #[link_name = "llvm.x86.avx2.gather.d.q.256"]
3733    fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
3734    #[link_name = "llvm.x86.avx2.gather.q.d"]
3735    fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
3736    #[link_name = "llvm.x86.avx2.gather.q.d.256"]
3737    fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
3738    #[link_name = "llvm.x86.avx2.gather.q.q"]
3739    fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
3740    #[link_name = "llvm.x86.avx2.gather.q.q.256"]
3741    fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
3742    #[link_name = "llvm.x86.avx2.gather.d.pd"]
3743    fn pgatherdpd(
3744        src: __m128d,
3745        slice: *const i8,
3746        offsets: i32x4,
3747        mask: __m128d,
3748        scale: i8,
3749    ) -> __m128d;
3750    #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
3751    fn vpgatherdpd(
3752        src: __m256d,
3753        slice: *const i8,
3754        offsets: i32x4,
3755        mask: __m256d,
3756        scale: i8,
3757    ) -> __m256d;
3758    #[link_name = "llvm.x86.avx2.gather.q.pd"]
3759    fn pgatherqpd(
3760        src: __m128d,
3761        slice: *const i8,
3762        offsets: i64x2,
3763        mask: __m128d,
3764        scale: i8,
3765    ) -> __m128d;
3766    #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
3767    fn vpgatherqpd(
3768        src: __m256d,
3769        slice: *const i8,
3770        offsets: i64x4,
3771        mask: __m256d,
3772        scale: i8,
3773    ) -> __m256d;
3774    #[link_name = "llvm.x86.avx2.gather.d.ps"]
3775    fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
3776    -> __m128;
3777    #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
3778    fn vpgatherdps(
3779        src: __m256,
3780        slice: *const i8,
3781        offsets: i32x8,
3782        mask: __m256,
3783        scale: i8,
3784    ) -> __m256;
3785    #[link_name = "llvm.x86.avx2.gather.q.ps"]
3786    fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
3787    -> __m128;
3788    #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
3789    fn vpgatherqps(
3790        src: __m128,
3791        slice: *const i8,
3792        offsets: i64x4,
3793        mask: __m128,
3794        scale: i8,
3795    ) -> __m128;
3796}
3797
3798#[cfg(test)]
3799mod tests {
3800
3801    use stdarch_test::simd_test;
3802
3803    use crate::core_arch::x86::*;
3804
3805    #[simd_test(enable = "avx2")]
3806    unsafe fn test_mm256_abs_epi32() {
3807        #[rustfmt::skip]
3808        let a = _mm256_setr_epi32(
3809            0, 1, -1, i32::MAX,
3810            i32::MIN, 100, -100, -32,
3811        );
3812        let r = _mm256_abs_epi32(a);
3813        #[rustfmt::skip]
3814        let e = _mm256_setr_epi32(
3815            0, 1, 1, i32::MAX,
3816            i32::MAX.wrapping_add(1), 100, 100, 32,
3817        );
3818        assert_eq_m256i(r, e);
3819    }
3820
3821    #[simd_test(enable = "avx2")]
3822    unsafe fn test_mm256_abs_epi16() {
3823        #[rustfmt::skip]
3824        let a = _mm256_setr_epi16(
3825            0,  1, -1, 2, -2, 3, -3, 4,
3826            -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32,
3827        );
3828        let r = _mm256_abs_epi16(a);
3829        #[rustfmt::skip]
3830        let e = _mm256_setr_epi16(
3831            0, 1, 1, 2, 2, 3, 3, 4,
3832            4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32,
3833        );
3834        assert_eq_m256i(r, e);
3835    }
3836
3837    #[simd_test(enable = "avx2")]
3838    unsafe fn test_mm256_abs_epi8() {
3839        #[rustfmt::skip]
3840        let a = _mm256_setr_epi8(
3841            0, 1, -1, 2, -2, 3, -3, 4,
3842            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
3843            0, 1, -1, 2, -2, 3, -3, 4,
3844            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
3845        );
3846        let r = _mm256_abs_epi8(a);
3847        #[rustfmt::skip]
3848        let e = _mm256_setr_epi8(
3849            0, 1, 1, 2, 2, 3, 3, 4,
3850            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
3851            0, 1, 1, 2, 2, 3, 3, 4,
3852            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
3853        );
3854        assert_eq_m256i(r, e);
3855    }
3856
3857    #[simd_test(enable = "avx2")]
3858    unsafe fn test_mm256_add_epi64() {
3859        let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
3860        let b = _mm256_setr_epi64x(-1, 0, 1, 2);
3861        let r = _mm256_add_epi64(a, b);
3862        let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
3863        assert_eq_m256i(r, e);
3864    }
3865
3866    #[simd_test(enable = "avx2")]
3867    unsafe fn test_mm256_add_epi32() {
3868        let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
3869        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3870        let r = _mm256_add_epi32(a, b);
3871        let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
3872        assert_eq_m256i(r, e);
3873    }
3874
3875    #[simd_test(enable = "avx2")]
3876    unsafe fn test_mm256_add_epi16() {
3877        #[rustfmt::skip]
3878        let a = _mm256_setr_epi16(
3879            0, 1, 2, 3, 4, 5, 6, 7,
3880            8, 9, 10, 11, 12, 13, 14, 15,
3881        );
3882        #[rustfmt::skip]
3883        let b = _mm256_setr_epi16(
3884            0, 1, 2, 3, 4, 5, 6, 7,
3885            8, 9, 10, 11, 12, 13, 14, 15,
3886        );
3887        let r = _mm256_add_epi16(a, b);
3888        #[rustfmt::skip]
3889        let e = _mm256_setr_epi16(
3890            0, 2, 4, 6, 8, 10, 12, 14,
3891            16, 18, 20, 22, 24, 26, 28, 30,
3892        );
3893        assert_eq_m256i(r, e);
3894    }
3895
3896    #[simd_test(enable = "avx2")]
3897    unsafe fn test_mm256_add_epi8() {
3898        #[rustfmt::skip]
3899        let a = _mm256_setr_epi8(
3900            0, 1, 2, 3, 4, 5, 6, 7,
3901            8, 9, 10, 11, 12, 13, 14, 15,
3902            16, 17, 18, 19, 20, 21, 22, 23,
3903            24, 25, 26, 27, 28, 29, 30, 31,
3904        );
3905        #[rustfmt::skip]
3906        let b = _mm256_setr_epi8(
3907            0, 1, 2, 3, 4, 5, 6, 7,
3908            8, 9, 10, 11, 12, 13, 14, 15,
3909            16, 17, 18, 19, 20, 21, 22, 23,
3910            24, 25, 26, 27, 28, 29, 30, 31,
3911        );
3912        let r = _mm256_add_epi8(a, b);
3913        #[rustfmt::skip]
3914        let e = _mm256_setr_epi8(
3915            0, 2, 4, 6, 8, 10, 12, 14,
3916            16, 18, 20, 22, 24, 26, 28, 30,
3917            32, 34, 36, 38, 40, 42, 44, 46,
3918            48, 50, 52, 54, 56, 58, 60, 62,
3919        );
3920        assert_eq_m256i(r, e);
3921    }
3922
3923    #[simd_test(enable = "avx2")]
3924    unsafe fn test_mm256_adds_epi8() {
3925        #[rustfmt::skip]
3926        let a = _mm256_setr_epi8(
3927            0, 1, 2, 3, 4, 5, 6, 7,
3928            8, 9, 10, 11, 12, 13, 14, 15,
3929            16, 17, 18, 19, 20, 21, 22, 23,
3930            24, 25, 26, 27, 28, 29, 30, 31,
3931        );
3932        #[rustfmt::skip]
3933        let b = _mm256_setr_epi8(
3934            32, 33, 34, 35, 36, 37, 38, 39,
3935            40, 41, 42, 43, 44, 45, 46, 47,
3936            48, 49, 50, 51, 52, 53, 54, 55,
3937            56, 57, 58, 59, 60, 61, 62, 63,
3938        );
3939        let r = _mm256_adds_epi8(a, b);
3940        #[rustfmt::skip]
3941        let e = _mm256_setr_epi8(
3942            32, 34, 36, 38, 40, 42, 44, 46,
3943            48, 50, 52, 54, 56, 58, 60, 62,
3944            64, 66, 68, 70, 72, 74, 76, 78,
3945            80, 82, 84, 86, 88, 90, 92, 94,
3946        );
3947        assert_eq_m256i(r, e);
3948    }
3949
3950    #[simd_test(enable = "avx2")]
3951    unsafe fn test_mm256_adds_epi8_saturate_positive() {
3952        let a = _mm256_set1_epi8(0x7F);
3953        let b = _mm256_set1_epi8(1);
3954        let r = _mm256_adds_epi8(a, b);
3955        assert_eq_m256i(r, a);
3956    }
3957
3958    #[simd_test(enable = "avx2")]
3959    unsafe fn test_mm256_adds_epi8_saturate_negative() {
3960        let a = _mm256_set1_epi8(-0x80);
3961        let b = _mm256_set1_epi8(-1);
3962        let r = _mm256_adds_epi8(a, b);
3963        assert_eq_m256i(r, a);
3964    }
3965
3966    #[simd_test(enable = "avx2")]
3967    unsafe fn test_mm256_adds_epi16() {
3968        #[rustfmt::skip]
3969        let a = _mm256_setr_epi16(
3970            0, 1, 2, 3, 4, 5, 6, 7,
3971            8, 9, 10, 11, 12, 13, 14, 15,
3972        );
3973        #[rustfmt::skip]
3974        let b = _mm256_setr_epi16(
3975            32, 33, 34, 35, 36, 37, 38, 39,
3976            40, 41, 42, 43, 44, 45, 46, 47,
3977        );
3978        let r = _mm256_adds_epi16(a, b);
3979        #[rustfmt::skip]
3980        let e = _mm256_setr_epi16(
3981            32, 34, 36, 38, 40, 42, 44, 46,
3982            48, 50, 52, 54, 56, 58, 60, 62,
3983        );
3984
3985        assert_eq_m256i(r, e);
3986    }
3987
3988    #[simd_test(enable = "avx2")]
3989    unsafe fn test_mm256_adds_epi16_saturate_positive() {
3990        let a = _mm256_set1_epi16(0x7FFF);
3991        let b = _mm256_set1_epi16(1);
3992        let r = _mm256_adds_epi16(a, b);
3993        assert_eq_m256i(r, a);
3994    }
3995
3996    #[simd_test(enable = "avx2")]
3997    unsafe fn test_mm256_adds_epi16_saturate_negative() {
3998        let a = _mm256_set1_epi16(-0x8000);
3999        let b = _mm256_set1_epi16(-1);
4000        let r = _mm256_adds_epi16(a, b);
4001        assert_eq_m256i(r, a);
4002    }
4003
4004    #[simd_test(enable = "avx2")]
4005    unsafe fn test_mm256_adds_epu8() {
4006        #[rustfmt::skip]
4007        let a = _mm256_setr_epi8(
4008            0, 1, 2, 3, 4, 5, 6, 7,
4009            8, 9, 10, 11, 12, 13, 14, 15,
4010            16, 17, 18, 19, 20, 21, 22, 23,
4011            24, 25, 26, 27, 28, 29, 30, 31,
4012        );
4013        #[rustfmt::skip]
4014        let b = _mm256_setr_epi8(
4015            32, 33, 34, 35, 36, 37, 38, 39,
4016            40, 41, 42, 43, 44, 45, 46, 47,
4017            48, 49, 50, 51, 52, 53, 54, 55,
4018            56, 57, 58, 59, 60, 61, 62, 63,
4019        );
4020        let r = _mm256_adds_epu8(a, b);
4021        #[rustfmt::skip]
4022        let e = _mm256_setr_epi8(
4023            32, 34, 36, 38, 40, 42, 44, 46,
4024            48, 50, 52, 54, 56, 58, 60, 62,
4025            64, 66, 68, 70, 72, 74, 76, 78,
4026            80, 82, 84, 86, 88, 90, 92, 94,
4027        );
4028        assert_eq_m256i(r, e);
4029    }
4030
4031    #[simd_test(enable = "avx2")]
4032    unsafe fn test_mm256_adds_epu8_saturate() {
4033        let a = _mm256_set1_epi8(!0);
4034        let b = _mm256_set1_epi8(1);
4035        let r = _mm256_adds_epu8(a, b);
4036        assert_eq_m256i(r, a);
4037    }
4038
4039    #[simd_test(enable = "avx2")]
4040    unsafe fn test_mm256_adds_epu16() {
4041        #[rustfmt::skip]
4042        let a = _mm256_setr_epi16(
4043            0, 1, 2, 3, 4, 5, 6, 7,
4044            8, 9, 10, 11, 12, 13, 14, 15,
4045        );
4046        #[rustfmt::skip]
4047        let b = _mm256_setr_epi16(
4048            32, 33, 34, 35, 36, 37, 38, 39,
4049            40, 41, 42, 43, 44, 45, 46, 47,
4050        );
4051        let r = _mm256_adds_epu16(a, b);
4052        #[rustfmt::skip]
4053        let e = _mm256_setr_epi16(
4054            32, 34, 36, 38, 40, 42, 44, 46,
4055            48, 50, 52, 54, 56, 58, 60, 62,
4056        );
4057
4058        assert_eq_m256i(r, e);
4059    }
4060
4061    #[simd_test(enable = "avx2")]
4062    unsafe fn test_mm256_adds_epu16_saturate() {
4063        let a = _mm256_set1_epi16(!0);
4064        let b = _mm256_set1_epi16(1);
4065        let r = _mm256_adds_epu16(a, b);
4066        assert_eq_m256i(r, a);
4067    }
4068
4069    #[simd_test(enable = "avx2")]
4070    unsafe fn test_mm256_and_si256() {
4071        let a = _mm256_set1_epi8(5);
4072        let b = _mm256_set1_epi8(3);
4073        let got = _mm256_and_si256(a, b);
4074        assert_eq_m256i(got, _mm256_set1_epi8(1));
4075    }
4076
4077    #[simd_test(enable = "avx2")]
4078    unsafe fn test_mm256_andnot_si256() {
4079        let a = _mm256_set1_epi8(5);
4080        let b = _mm256_set1_epi8(3);
4081        let got = _mm256_andnot_si256(a, b);
4082        assert_eq_m256i(got, _mm256_set1_epi8(2));
4083    }
4084
4085    #[simd_test(enable = "avx2")]
4086    unsafe fn test_mm256_avg_epu8() {
4087        let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9));
4088        let r = _mm256_avg_epu8(a, b);
4089        assert_eq_m256i(r, _mm256_set1_epi8(6));
4090    }
4091
4092    #[simd_test(enable = "avx2")]
4093    unsafe fn test_mm256_avg_epu16() {
4094        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4095        let r = _mm256_avg_epu16(a, b);
4096        assert_eq_m256i(r, _mm256_set1_epi16(6));
4097    }
4098
4099    #[simd_test(enable = "avx2")]
4100    unsafe fn test_mm_blend_epi32() {
4101        let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9));
4102        let e = _mm_setr_epi32(9, 3, 3, 3);
4103        let r = _mm_blend_epi32::<0x01>(a, b);
4104        assert_eq_m128i(r, e);
4105
4106        let r = _mm_blend_epi32::<0x0E>(b, a);
4107        assert_eq_m128i(r, e);
4108    }
4109
4110    #[simd_test(enable = "avx2")]
4111    unsafe fn test_mm256_blend_epi32() {
4112        let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9));
4113        let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
4114        let r = _mm256_blend_epi32::<0x01>(a, b);
4115        assert_eq_m256i(r, e);
4116
4117        let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
4118        let r = _mm256_blend_epi32::<0x82>(a, b);
4119        assert_eq_m256i(r, e);
4120
4121        let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
4122        let r = _mm256_blend_epi32::<0x7C>(a, b);
4123        assert_eq_m256i(r, e);
4124    }
4125
4126    #[simd_test(enable = "avx2")]
4127    unsafe fn test_mm256_blend_epi16() {
4128        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4129        let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3);
4130        let r = _mm256_blend_epi16::<0x01>(a, b);
4131        assert_eq_m256i(r, e);
4132
4133        let r = _mm256_blend_epi16::<0xFE>(b, a);
4134        assert_eq_m256i(r, e);
4135    }
4136
4137    #[simd_test(enable = "avx2")]
4138    unsafe fn test_mm256_blendv_epi8() {
4139        let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2));
4140        let mask = _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), -1);
4141        let e = _mm256_insert_epi8::<2>(_mm256_set1_epi8(4), 2);
4142        let r = _mm256_blendv_epi8(a, b, mask);
4143        assert_eq_m256i(r, e);
4144    }
4145
4146    #[simd_test(enable = "avx2")]
4147    unsafe fn test_mm_broadcastb_epi8() {
4148        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
4149        let res = _mm_broadcastb_epi8(a);
4150        assert_eq_m128i(res, _mm_set1_epi8(0x2a));
4151    }
4152
4153    #[simd_test(enable = "avx2")]
4154    unsafe fn test_mm256_broadcastb_epi8() {
4155        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
4156        let res = _mm256_broadcastb_epi8(a);
4157        assert_eq_m256i(res, _mm256_set1_epi8(0x2a));
4158    }
4159
4160    #[simd_test(enable = "avx2")]
4161    unsafe fn test_mm_broadcastd_epi32() {
4162        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4163        let res = _mm_broadcastd_epi32(a);
4164        assert_eq_m128i(res, _mm_set1_epi32(0x2a));
4165    }
4166
4167    #[simd_test(enable = "avx2")]
4168    unsafe fn test_mm256_broadcastd_epi32() {
4169        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4170        let res = _mm256_broadcastd_epi32(a);
4171        assert_eq_m256i(res, _mm256_set1_epi32(0x2a));
4172    }
4173
4174    #[simd_test(enable = "avx2")]
4175    unsafe fn test_mm_broadcastq_epi64() {
4176        let a = _mm_setr_epi64x(0x1ffffffff, 0);
4177        let res = _mm_broadcastq_epi64(a);
4178        assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff));
4179    }
4180
4181    #[simd_test(enable = "avx2")]
4182    unsafe fn test_mm256_broadcastq_epi64() {
4183        let a = _mm_setr_epi64x(0x1ffffffff, 0);
4184        let res = _mm256_broadcastq_epi64(a);
4185        assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff));
4186    }
4187
4188    #[simd_test(enable = "avx2")]
4189    unsafe fn test_mm_broadcastsd_pd() {
4190        let a = _mm_setr_pd(6.88, 3.44);
4191        let res = _mm_broadcastsd_pd(a);
4192        assert_eq_m128d(res, _mm_set1_pd(6.88));
4193    }
4194
4195    #[simd_test(enable = "avx2")]
4196    unsafe fn test_mm256_broadcastsd_pd() {
4197        let a = _mm_setr_pd(6.88, 3.44);
4198        let res = _mm256_broadcastsd_pd(a);
4199        assert_eq_m256d(res, _mm256_set1_pd(6.88f64));
4200    }
4201
4202    #[simd_test(enable = "avx2")]
4203    unsafe fn test_mm_broadcastsi128_si256() {
4204        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4205        let res = _mm_broadcastsi128_si256(a);
4206        let retval = _mm256_setr_epi64x(
4207            0x0987654321012334,
4208            0x5678909876543210,
4209            0x0987654321012334,
4210            0x5678909876543210,
4211        );
4212        assert_eq_m256i(res, retval);
4213    }
4214
4215    #[simd_test(enable = "avx2")]
4216    unsafe fn test_mm256_broadcastsi128_si256() {
4217        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4218        let res = _mm256_broadcastsi128_si256(a);
4219        let retval = _mm256_setr_epi64x(
4220            0x0987654321012334,
4221            0x5678909876543210,
4222            0x0987654321012334,
4223            0x5678909876543210,
4224        );
4225        assert_eq_m256i(res, retval);
4226    }
4227
4228    #[simd_test(enable = "avx2")]
4229    unsafe fn test_mm_broadcastss_ps() {
4230        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
4231        let res = _mm_broadcastss_ps(a);
4232        assert_eq_m128(res, _mm_set1_ps(6.88));
4233    }
4234
4235    #[simd_test(enable = "avx2")]
4236    unsafe fn test_mm256_broadcastss_ps() {
4237        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
4238        let res = _mm256_broadcastss_ps(a);
4239        assert_eq_m256(res, _mm256_set1_ps(6.88));
4240    }
4241
4242    #[simd_test(enable = "avx2")]
4243    unsafe fn test_mm_broadcastw_epi16() {
4244        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
4245        let res = _mm_broadcastw_epi16(a);
4246        assert_eq_m128i(res, _mm_set1_epi16(0x22b));
4247    }
4248
4249    #[simd_test(enable = "avx2")]
4250    unsafe fn test_mm256_broadcastw_epi16() {
4251        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
4252        let res = _mm256_broadcastw_epi16(a);
4253        assert_eq_m256i(res, _mm256_set1_epi16(0x22b));
4254    }
4255
4256    #[simd_test(enable = "avx2")]
4257    unsafe fn test_mm256_cmpeq_epi8() {
4258        #[rustfmt::skip]
4259        let a = _mm256_setr_epi8(
4260            0, 1, 2, 3, 4, 5, 6, 7,
4261            8, 9, 10, 11, 12, 13, 14, 15,
4262            16, 17, 18, 19, 20, 21, 22, 23,
4263            24, 25, 26, 27, 28, 29, 30, 31,
4264        );
4265        #[rustfmt::skip]
4266        let b = _mm256_setr_epi8(
4267            31, 30, 2, 28, 27, 26, 25, 24,
4268            23, 22, 21, 20, 19, 18, 17, 16,
4269            15, 14, 13, 12, 11, 10, 9, 8,
4270            7, 6, 5, 4, 3, 2, 1, 0,
4271        );
4272        let r = _mm256_cmpeq_epi8(a, b);
4273        assert_eq_m256i(r, _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), !0));
4274    }
4275
4276    #[simd_test(enable = "avx2")]
4277    unsafe fn test_mm256_cmpeq_epi16() {
4278        #[rustfmt::skip]
4279        let a = _mm256_setr_epi16(
4280            0, 1, 2, 3, 4, 5, 6, 7,
4281            8, 9, 10, 11, 12, 13, 14, 15,
4282        );
4283        #[rustfmt::skip]
4284        let b = _mm256_setr_epi16(
4285            15, 14, 2, 12, 11, 10, 9, 8,
4286            7, 6, 5, 4, 3, 2, 1, 0,
4287        );
4288        let r = _mm256_cmpeq_epi16(a, b);
4289        assert_eq_m256i(r, _mm256_insert_epi16::<2>(_mm256_set1_epi16(0), !0));
4290    }
4291
4292    #[simd_test(enable = "avx2")]
4293    unsafe fn test_mm256_cmpeq_epi32() {
4294        let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4295        let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
4296        let r = _mm256_cmpeq_epi32(a, b);
4297        let e = _mm256_set1_epi32(0);
4298        let e = _mm256_insert_epi32::<2>(e, !0);
4299        assert_eq_m256i(r, e);
4300    }
4301
4302    #[simd_test(enable = "avx2")]
4303    unsafe fn test_mm256_cmpeq_epi64() {
4304        let a = _mm256_setr_epi64x(0, 1, 2, 3);
4305        let b = _mm256_setr_epi64x(3, 2, 2, 0);
4306        let r = _mm256_cmpeq_epi64(a, b);
4307        assert_eq_m256i(r, _mm256_insert_epi64::<2>(_mm256_set1_epi64x(0), !0));
4308    }
4309
4310    #[simd_test(enable = "avx2")]
4311    unsafe fn test_mm256_cmpgt_epi8() {
4312        let a = _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), 5);
4313        let b = _mm256_set1_epi8(0);
4314        let r = _mm256_cmpgt_epi8(a, b);
4315        assert_eq_m256i(r, _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), !0));
4316    }
4317
4318    #[simd_test(enable = "avx2")]
4319    unsafe fn test_mm256_cmpgt_epi16() {
4320        let a = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 5);
4321        let b = _mm256_set1_epi16(0);
4322        let r = _mm256_cmpgt_epi16(a, b);
4323        assert_eq_m256i(r, _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), !0));
4324    }
4325
4326    #[simd_test(enable = "avx2")]
4327    unsafe fn test_mm256_cmpgt_epi32() {
4328        let a = _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), 5);
4329        let b = _mm256_set1_epi32(0);
4330        let r = _mm256_cmpgt_epi32(a, b);
4331        assert_eq_m256i(r, _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), !0));
4332    }
4333
4334    #[simd_test(enable = "avx2")]
4335    unsafe fn test_mm256_cmpgt_epi64() {
4336        let a = _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), 5);
4337        let b = _mm256_set1_epi64x(0);
4338        let r = _mm256_cmpgt_epi64(a, b);
4339        assert_eq_m256i(r, _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), !0));
4340    }
4341
4342    #[simd_test(enable = "avx2")]
4343    unsafe fn test_mm256_cvtepi8_epi16() {
4344        #[rustfmt::skip]
4345        let a = _mm_setr_epi8(
4346            0, 0, -1, 1, -2, 2, -3, 3,
4347            -4, 4, -5, 5, -6, 6, -7, 7,
4348        );
4349        #[rustfmt::skip]
4350        let r = _mm256_setr_epi16(
4351            0, 0, -1, 1, -2, 2, -3, 3,
4352            -4, 4, -5, 5, -6, 6, -7, 7,
4353        );
4354        assert_eq_m256i(r, _mm256_cvtepi8_epi16(a));
4355    }
4356
4357    #[simd_test(enable = "avx2")]
4358    unsafe fn test_mm256_cvtepi8_epi32() {
4359        #[rustfmt::skip]
4360        let a = _mm_setr_epi8(
4361            0, 0, -1, 1, -2, 2, -3, 3,
4362            -4, 4, -5, 5, -6, 6, -7, 7,
4363        );
4364        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4365        assert_eq_m256i(r, _mm256_cvtepi8_epi32(a));
4366    }
4367
4368    #[simd_test(enable = "avx2")]
4369    unsafe fn test_mm256_cvtepi8_epi64() {
4370        #[rustfmt::skip]
4371        let a = _mm_setr_epi8(
4372            0, 0, -1, 1, -2, 2, -3, 3,
4373            -4, 4, -5, 5, -6, 6, -7, 7,
4374        );
4375        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4376        assert_eq_m256i(r, _mm256_cvtepi8_epi64(a));
4377    }
4378
4379    #[simd_test(enable = "avx2")]
4380    unsafe fn test_mm256_cvtepi16_epi32() {
4381        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4382        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4383        assert_eq_m256i(r, _mm256_cvtepi16_epi32(a));
4384    }
4385
4386    #[simd_test(enable = "avx2")]
4387    unsafe fn test_mm256_cvtepi16_epi64() {
4388        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4389        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4390        assert_eq_m256i(r, _mm256_cvtepi16_epi64(a));
4391    }
4392
4393    #[simd_test(enable = "avx2")]
4394    unsafe fn test_mm256_cvtepi32_epi64() {
4395        let a = _mm_setr_epi32(0, 0, -1, 1);
4396        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4397        assert_eq_m256i(r, _mm256_cvtepi32_epi64(a));
4398    }
4399
4400    #[simd_test(enable = "avx2")]
4401    unsafe fn test_mm256_cvtepu16_epi32() {
4402        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4403        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4404        assert_eq_m256i(r, _mm256_cvtepu16_epi32(a));
4405    }
4406
4407    #[simd_test(enable = "avx2")]
4408    unsafe fn test_mm256_cvtepu16_epi64() {
4409        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4410        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4411        assert_eq_m256i(r, _mm256_cvtepu16_epi64(a));
4412    }
4413
4414    #[simd_test(enable = "avx2")]
4415    unsafe fn test_mm256_cvtepu32_epi64() {
4416        let a = _mm_setr_epi32(0, 1, 2, 3);
4417        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4418        assert_eq_m256i(r, _mm256_cvtepu32_epi64(a));
4419    }
4420
4421    #[simd_test(enable = "avx2")]
4422    unsafe fn test_mm256_cvtepu8_epi16() {
4423        #[rustfmt::skip]
4424        let a = _mm_setr_epi8(
4425            0, 1, 2, 3, 4, 5, 6, 7,
4426            8, 9, 10, 11, 12, 13, 14, 15,
4427        );
4428        #[rustfmt::skip]
4429        let r = _mm256_setr_epi16(
4430            0, 1, 2, 3, 4, 5, 6, 7,
4431            8, 9, 10, 11, 12, 13, 14, 15,
4432        );
4433        assert_eq_m256i(r, _mm256_cvtepu8_epi16(a));
4434    }
4435
4436    #[simd_test(enable = "avx2")]
4437    unsafe fn test_mm256_cvtepu8_epi32() {
4438        #[rustfmt::skip]
4439        let a = _mm_setr_epi8(
4440            0, 1, 2, 3, 4, 5, 6, 7,
4441            8, 9, 10, 11, 12, 13, 14, 15,
4442        );
4443        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4444        assert_eq_m256i(r, _mm256_cvtepu8_epi32(a));
4445    }
4446
4447    #[simd_test(enable = "avx2")]
4448    unsafe fn test_mm256_cvtepu8_epi64() {
4449        #[rustfmt::skip]
4450        let a = _mm_setr_epi8(
4451            0, 1, 2, 3, 4, 5, 6, 7,
4452            8, 9, 10, 11, 12, 13, 14, 15,
4453        );
4454        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4455        assert_eq_m256i(r, _mm256_cvtepu8_epi64(a));
4456    }
4457
4458    #[simd_test(enable = "avx2")]
4459    unsafe fn test_mm256_extracti128_si256() {
4460        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4461        let r = _mm256_extracti128_si256::<1>(a);
4462        let e = _mm_setr_epi64x(3, 4);
4463        assert_eq_m128i(r, e);
4464    }
4465
4466    #[simd_test(enable = "avx2")]
4467    unsafe fn test_mm256_hadd_epi16() {
4468        let a = _mm256_set1_epi16(2);
4469        let b = _mm256_set1_epi16(4);
4470        let r = _mm256_hadd_epi16(a, b);
4471        let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
4472        assert_eq_m256i(r, e);
4473    }
4474
4475    #[simd_test(enable = "avx2")]
4476    unsafe fn test_mm256_hadd_epi32() {
4477        let a = _mm256_set1_epi32(2);
4478        let b = _mm256_set1_epi32(4);
4479        let r = _mm256_hadd_epi32(a, b);
4480        let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
4481        assert_eq_m256i(r, e);
4482    }
4483
4484    #[simd_test(enable = "avx2")]
4485    unsafe fn test_mm256_hadds_epi16() {
4486        let a = _mm256_set1_epi16(2);
4487        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
4488        let a = _mm256_insert_epi16::<1>(a, 1);
4489        let b = _mm256_set1_epi16(4);
4490        let r = _mm256_hadds_epi16(a, b);
4491        #[rustfmt::skip]
4492        let e = _mm256_setr_epi16(
4493            0x7FFF, 4, 4, 4, 8, 8, 8, 8,
4494            4, 4, 4, 4, 8, 8, 8, 8,
4495        );
4496        assert_eq_m256i(r, e);
4497    }
4498
4499    #[simd_test(enable = "avx2")]
4500    unsafe fn test_mm256_hsub_epi16() {
4501        let a = _mm256_set1_epi16(2);
4502        let b = _mm256_set1_epi16(4);
4503        let r = _mm256_hsub_epi16(a, b);
4504        let e = _mm256_set1_epi16(0);
4505        assert_eq_m256i(r, e);
4506    }
4507
4508    #[simd_test(enable = "avx2")]
4509    unsafe fn test_mm256_hsub_epi32() {
4510        let a = _mm256_set1_epi32(2);
4511        let b = _mm256_set1_epi32(4);
4512        let r = _mm256_hsub_epi32(a, b);
4513        let e = _mm256_set1_epi32(0);
4514        assert_eq_m256i(r, e);
4515    }
4516
4517    #[simd_test(enable = "avx2")]
4518    unsafe fn test_mm256_hsubs_epi16() {
4519        let a = _mm256_set1_epi16(2);
4520        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
4521        let a = _mm256_insert_epi16::<1>(a, -1);
4522        let b = _mm256_set1_epi16(4);
4523        let r = _mm256_hsubs_epi16(a, b);
4524        let e = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 0x7FFF);
4525        assert_eq_m256i(r, e);
4526    }
4527
4528    #[simd_test(enable = "avx2")]
4529    unsafe fn test_mm256_madd_epi16() {
4530        let a = _mm256_set1_epi16(2);
4531        let b = _mm256_set1_epi16(4);
4532        let r = _mm256_madd_epi16(a, b);
4533        let e = _mm256_set1_epi32(16);
4534        assert_eq_m256i(r, e);
4535    }
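
    // Editor's supplementary sketch (not upstream): `_mm256_madd_epi16` sums
    // adjacent 16-bit products into each 32-bit lane; distinct even/odd values
    // make the pairing visible. The test name is hypothetical.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_madd_epi16_pairs() {
        // Even 16-bit lanes hold 1, odd lanes hold 2 (i32 pattern 0x0002_0001).
        let a = _mm256_set1_epi32(0x0002_0001);
        // Even lanes hold 10, odd lanes hold 100 (i32 pattern 0x0064_000A).
        let b = _mm256_set1_epi32(0x0064_000A);
        // Each 32-bit result is 1 * 10 + 2 * 100 = 210.
        let r = _mm256_madd_epi16(a, b);
        assert_eq_m256i(r, _mm256_set1_epi32(210));
    }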
4536
4537    #[simd_test(enable = "avx2")]
4538    unsafe fn test_mm256_inserti128_si256() {
4539        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4540        let b = _mm_setr_epi64x(7, 8);
4541        let r = _mm256_inserti128_si256::<1>(a, b);
4542        let e = _mm256_setr_epi64x(1, 2, 7, 8);
4543        assert_eq_m256i(r, e);
4544    }
4545
4546    #[simd_test(enable = "avx2")]
4547    unsafe fn test_mm256_maddubs_epi16() {
4548        let a = _mm256_set1_epi8(2);
4549        let b = _mm256_set1_epi8(4);
4550        let r = _mm256_maddubs_epi16(a, b);
4551        let e = _mm256_set1_epi16(16);
4552        assert_eq_m256i(r, e);
4553    }
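
    // Editor's supplementary sketch (not upstream): `_mm256_maddubs_epi16`
    // treats `a` as unsigned bytes and `b` as signed bytes, so an all-ones
    // input behaves very differently on each side. Hypothetical test name.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_maddubs_epi16_mixed_signs() {
        let a = _mm256_set1_epi8(-1); // read by the instruction as 255
        let b = _mm256_set1_epi8(-1); // read as -1
        // Each 16-bit result is 255 * -1 + 255 * -1 = -510 (no saturation).
        let r = _mm256_maddubs_epi16(a, b);
        assert_eq_m256i(r, _mm256_set1_epi16(-510));
    }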
4554
4555    #[simd_test(enable = "avx2")]
4556    unsafe fn test_mm_maskload_epi32() {
4557        let nums = [1, 2, 3, 4];
4558        let a = &nums as *const i32;
4559        let mask = _mm_setr_epi32(-1, 0, 0, -1);
4560        let r = _mm_maskload_epi32(a, mask);
4561        let e = _mm_setr_epi32(1, 0, 0, 4);
4562        assert_eq_m128i(r, e);
4563    }
4564
4565    #[simd_test(enable = "avx2")]
4566    unsafe fn test_mm256_maskload_epi32() {
4567        let nums = [1, 2, 3, 4, 5, 6, 7, 8];
4568        let a = &nums as *const i32;
4569        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4570        let r = _mm256_maskload_epi32(a, mask);
4571        let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
4572        assert_eq_m256i(r, e);
4573    }
4574
4575    #[simd_test(enable = "avx2")]
4576    unsafe fn test_mm_maskload_epi64() {
4577        let nums = [1_i64, 2_i64];
4578        let a = &nums as *const i64;
4579        let mask = _mm_setr_epi64x(0, -1);
4580        let r = _mm_maskload_epi64(a, mask);
4581        let e = _mm_setr_epi64x(0, 2);
4582        assert_eq_m128i(r, e);
4583    }
4584
4585    #[simd_test(enable = "avx2")]
4586    unsafe fn test_mm256_maskload_epi64() {
4587        let nums = [1_i64, 2_i64, 3_i64, 4_i64];
4588        let a = &nums as *const i64;
4589        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4590        let r = _mm256_maskload_epi64(a, mask);
4591        let e = _mm256_setr_epi64x(0, 2, 3, 0);
4592        assert_eq_m256i(r, e);
4593    }
4594
4595    #[simd_test(enable = "avx2")]
4596    unsafe fn test_mm_maskstore_epi32() {
4597        let a = _mm_setr_epi32(1, 2, 3, 4);
4598        let mut arr = [-1, -1, -1, -1];
4599        let mask = _mm_setr_epi32(-1, 0, 0, -1);
4600        _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4601        let e = [1, -1, -1, 4];
4602        assert_eq!(arr, e);
4603    }
4604
4605    #[simd_test(enable = "avx2")]
4606    unsafe fn test_mm256_maskstore_epi32() {
4607        let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
4608        let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
4609        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4610        _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4611        let e = [1, -1, -1, 42, -1, 6, 7, -1];
4612        assert_eq!(arr, e);
4613    }
4614
4615    #[simd_test(enable = "avx2")]
4616    unsafe fn test_mm_maskstore_epi64() {
4617        let a = _mm_setr_epi64x(1_i64, 2_i64);
4618        let mut arr = [-1_i64, -1_i64];
4619        let mask = _mm_setr_epi64x(0, -1);
4620        _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4621        let e = [-1, 2];
4622        assert_eq!(arr, e);
4623    }
4624
4625    #[simd_test(enable = "avx2")]
4626    unsafe fn test_mm256_maskstore_epi64() {
4627        let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
4628        let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
4629        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4630        _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4631        let e = [-1, 2, 3, -1];
4632        assert_eq!(arr, e);
4633    }
4634
4635    #[simd_test(enable = "avx2")]
4636    unsafe fn test_mm256_max_epi16() {
4637        let a = _mm256_set1_epi16(2);
4638        let b = _mm256_set1_epi16(4);
4639        let r = _mm256_max_epi16(a, b);
4640        assert_eq_m256i(r, b);
4641    }
4642
4643    #[simd_test(enable = "avx2")]
4644    unsafe fn test_mm256_max_epi32() {
4645        let a = _mm256_set1_epi32(2);
4646        let b = _mm256_set1_epi32(4);
4647        let r = _mm256_max_epi32(a, b);
4648        assert_eq_m256i(r, b);
4649    }
4650
4651    #[simd_test(enable = "avx2")]
4652    unsafe fn test_mm256_max_epi8() {
4653        let a = _mm256_set1_epi8(2);
4654        let b = _mm256_set1_epi8(4);
4655        let r = _mm256_max_epi8(a, b);
4656        assert_eq_m256i(r, b);
4657    }
4658
4659    #[simd_test(enable = "avx2")]
4660    unsafe fn test_mm256_max_epu16() {
4661        let a = _mm256_set1_epi16(2);
4662        let b = _mm256_set1_epi16(4);
4663        let r = _mm256_max_epu16(a, b);
4664        assert_eq_m256i(r, b);
4665    }
4666
4667    #[simd_test(enable = "avx2")]
4668    unsafe fn test_mm256_max_epu32() {
4669        let a = _mm256_set1_epi32(2);
4670        let b = _mm256_set1_epi32(4);
4671        let r = _mm256_max_epu32(a, b);
4672        assert_eq_m256i(r, b);
4673    }
4674
4675    #[simd_test(enable = "avx2")]
4676    unsafe fn test_mm256_max_epu8() {
4677        let a = _mm256_set1_epi8(2);
4678        let b = _mm256_set1_epi8(4);
4679        let r = _mm256_max_epu8(a, b);
4680        assert_eq_m256i(r, b);
4681    }
4682
4683    #[simd_test(enable = "avx2")]
4684    unsafe fn test_mm256_min_epi16() {
4685        let a = _mm256_set1_epi16(2);
4686        let b = _mm256_set1_epi16(4);
4687        let r = _mm256_min_epi16(a, b);
4688        assert_eq_m256i(r, a);
4689    }
4690
4691    #[simd_test(enable = "avx2")]
4692    unsafe fn test_mm256_min_epi32() {
4693        let a = _mm256_set1_epi32(2);
4694        let b = _mm256_set1_epi32(4);
4695        let r = _mm256_min_epi32(a, b);
4696        assert_eq_m256i(r, a);
4697    }
4698
4699    #[simd_test(enable = "avx2")]
4700    unsafe fn test_mm256_min_epi8() {
4701        let a = _mm256_set1_epi8(2);
4702        let b = _mm256_set1_epi8(4);
4703        let r = _mm256_min_epi8(a, b);
4704        assert_eq_m256i(r, a);
4705    }
4706
4707    #[simd_test(enable = "avx2")]
4708    unsafe fn test_mm256_min_epu16() {
4709        let a = _mm256_set1_epi16(2);
4710        let b = _mm256_set1_epi16(4);
4711        let r = _mm256_min_epu16(a, b);
4712        assert_eq_m256i(r, a);
4713    }
4714
4715    #[simd_test(enable = "avx2")]
4716    unsafe fn test_mm256_min_epu32() {
4717        let a = _mm256_set1_epi32(2);
4718        let b = _mm256_set1_epi32(4);
4719        let r = _mm256_min_epu32(a, b);
4720        assert_eq_m256i(r, a);
4721    }
4722
4723    #[simd_test(enable = "avx2")]
4724    unsafe fn test_mm256_min_epu8() {
4725        let a = _mm256_set1_epi8(2);
4726        let b = _mm256_set1_epi8(4);
4727        let r = _mm256_min_epu8(a, b);
4728        assert_eq_m256i(r, a);
4729    }
4730
4731    #[simd_test(enable = "avx2")]
4732    unsafe fn test_mm256_movemask_epi8() {
4733        let a = _mm256_set1_epi8(-1);
4734        let r = _mm256_movemask_epi8(a);
4735        let e = -1;
4736        assert_eq!(r, e);
4737    }
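
    // Editor's supplementary sketch (not upstream): `_mm256_movemask_epi8`
    // packs one sign bit per byte, so setting a single byte's sign bit sets
    // exactly one bit of the result. Hypothetical test name.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_movemask_epi8_single_bit() {
        let a = _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), -1);
        let r = _mm256_movemask_epi8(a);
        assert_eq!(r, 1 << 2);
    }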
4738
4739    #[simd_test(enable = "avx2")]
4740    unsafe fn test_mm256_mpsadbw_epu8() {
4741        let a = _mm256_set1_epi8(2);
4742        let b = _mm256_set1_epi8(4);
4743        let r = _mm256_mpsadbw_epu8::<0>(a, b);
4744        let e = _mm256_set1_epi16(8);
4745        assert_eq_m256i(r, e);
4746    }
4747
4748    #[simd_test(enable = "avx2")]
4749    unsafe fn test_mm256_mul_epi32() {
4750        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4751        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4752        let r = _mm256_mul_epi32(a, b);
4753        let e = _mm256_setr_epi64x(0, 0, 10, 14);
4754        assert_eq_m256i(r, e);
4755    }
4756
4757    #[simd_test(enable = "avx2")]
4758    unsafe fn test_mm256_mul_epu32() {
4759        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4760        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4761        let r = _mm256_mul_epu32(a, b);
4762        let e = _mm256_setr_epi64x(0, 0, 10, 14);
4763        assert_eq_m256i(r, e);
4764    }
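
    // Editor's supplementary sketch (not upstream): the widening multiplies
    // differ only in how the even 32-bit lanes are extended; a negative input
    // makes that visible. Hypothetical test name.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mul_epi32_vs_epu32_sign() {
        let a = _mm256_set1_epi32(-1);
        let b = _mm256_set1_epi32(2);
        // Signed: -1 * 2 == -2 in every 64-bit lane.
        assert_eq_m256i(_mm256_mul_epi32(a, b), _mm256_set1_epi64x(-2));
        // Unsigned: 0xFFFF_FFFF * 2 == 0x1_FFFF_FFFE.
        assert_eq_m256i(_mm256_mul_epu32(a, b), _mm256_set1_epi64x(0x1_FFFF_FFFE));
    }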
4765
4766    #[simd_test(enable = "avx2")]
4767    unsafe fn test_mm256_mulhi_epi16() {
4768        let a = _mm256_set1_epi16(6535);
4769        let b = _mm256_set1_epi16(6535);
4770        let r = _mm256_mulhi_epi16(a, b);
4771        let e = _mm256_set1_epi16(651);
4772        assert_eq_m256i(r, e);
4773    }
4774
4775    #[simd_test(enable = "avx2")]
4776    unsafe fn test_mm256_mulhi_epu16() {
4777        let a = _mm256_set1_epi16(6535);
4778        let b = _mm256_set1_epi16(6535);
4779        let r = _mm256_mulhi_epu16(a, b);
4780        let e = _mm256_set1_epi16(651);
4781        assert_eq_m256i(r, e);
4782    }
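
    // Editor's supplementary sketch (not upstream): with a negative operand the
    // signed and unsigned high-half multiplies diverge. Hypothetical test name.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mulhi_signed_vs_unsigned() {
        let a = _mm256_set1_epi16(-1);
        let b = _mm256_set1_epi16(2);
        // Signed: -1 * 2 == -2, whose upper 16 bits are all ones (-1).
        assert_eq_m256i(_mm256_mulhi_epi16(a, b), _mm256_set1_epi16(-1));
        // Unsigned: 0xFFFF * 2 == 0x1_FFFE, whose upper 16 bits are 1.
        assert_eq_m256i(_mm256_mulhi_epu16(a, b), _mm256_set1_epi16(1));
    }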
4783
4784    #[simd_test(enable = "avx2")]
4785    unsafe fn test_mm256_mullo_epi16() {
4786        let a = _mm256_set1_epi16(2);
4787        let b = _mm256_set1_epi16(4);
4788        let r = _mm256_mullo_epi16(a, b);
4789        let e = _mm256_set1_epi16(8);
4790        assert_eq_m256i(r, e);
4791    }
4792
4793    #[simd_test(enable = "avx2")]
4794    unsafe fn test_mm256_mullo_epi32() {
4795        let a = _mm256_set1_epi32(2);
4796        let b = _mm256_set1_epi32(4);
4797        let r = _mm256_mullo_epi32(a, b);
4798        let e = _mm256_set1_epi32(8);
4799        assert_eq_m256i(r, e);
4800    }
4801
4802    #[simd_test(enable = "avx2")]
4803    unsafe fn test_mm256_mulhrs_epi16() {
4804        let a = _mm256_set1_epi16(2);
4805        let b = _mm256_set1_epi16(4);
4806        let r = _mm256_mulhrs_epi16(a, b);
4807        let e = _mm256_set1_epi16(0); // ((2 * 4) >> 14) + 1, then >> 1, is 0
4808        assert_eq_m256i(r, e);
4809    }
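
    // Editor's supplementary sketch (not upstream): a case where the rounding
    // high multiply ((a * b) >> 14, plus 1, then >> 1) produces a non-zero
    // result. Hypothetical test name.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mulhrs_epi16_nonzero() {
        let a = _mm256_set1_epi16(0x4000);
        let b = _mm256_set1_epi16(0x2000);
        // (0x4000 * 0x2000) >> 14 == 0x2000; adding 1 and shifting gives 0x1000.
        let r = _mm256_mulhrs_epi16(a, b);
        assert_eq_m256i(r, _mm256_set1_epi16(0x1000));
    }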
4810
4811    #[simd_test(enable = "avx2")]
4812    unsafe fn test_mm256_or_si256() {
4813        let a = _mm256_set1_epi8(-1);
4814        let b = _mm256_set1_epi8(0);
4815        let r = _mm256_or_si256(a, b);
4816        assert_eq_m256i(r, a);
4817    }
4818
4819    #[simd_test(enable = "avx2")]
4820    unsafe fn test_mm256_packs_epi16() {
4821        let a = _mm256_set1_epi16(2);
4822        let b = _mm256_set1_epi16(4);
4823        let r = _mm256_packs_epi16(a, b);
4824        #[rustfmt::skip]
4825        let e = _mm256_setr_epi8(
4826            2, 2, 2, 2, 2, 2, 2, 2,
4827            4, 4, 4, 4, 4, 4, 4, 4,
4828            2, 2, 2, 2, 2, 2, 2, 2,
4829            4, 4, 4, 4, 4, 4, 4, 4,
4830        );
4831
4832        assert_eq_m256i(r, e);
4833    }
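
    // Editor's supplementary sketch (not upstream): values outside the i8 range
    // are saturated, not truncated, when packing. Hypothetical test name.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_packs_epi16_saturate() {
        let a = _mm256_set1_epi16(300);
        let b = _mm256_set1_epi16(-300);
        let r = _mm256_packs_epi16(a, b);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            127, 127, 127, 127, 127, 127, 127, 127,
            -128, -128, -128, -128, -128, -128, -128, -128,
            127, 127, 127, 127, 127, 127, 127, 127,
            -128, -128, -128, -128, -128, -128, -128, -128,
        );
        assert_eq_m256i(r, e);
    }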
4834
4835    #[simd_test(enable = "avx2")]
4836    unsafe fn test_mm256_packs_epi32() {
4837        let a = _mm256_set1_epi32(2);
4838        let b = _mm256_set1_epi32(4);
4839        let r = _mm256_packs_epi32(a, b);
4840        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
4841
4842        assert_eq_m256i(r, e);
4843    }
4844
4845    #[simd_test(enable = "avx2")]
4846    unsafe fn test_mm256_packus_epi16() {
4847        let a = _mm256_set1_epi16(2);
4848        let b = _mm256_set1_epi16(4);
4849        let r = _mm256_packus_epi16(a, b);
4850        #[rustfmt::skip]
4851        let e = _mm256_setr_epi8(
4852            2, 2, 2, 2, 2, 2, 2, 2,
4853            4, 4, 4, 4, 4, 4, 4, 4,
4854            2, 2, 2, 2, 2, 2, 2, 2,
4855            4, 4, 4, 4, 4, 4, 4, 4,
4856        );
4857
4858        assert_eq_m256i(r, e);
4859    }
4860
4861    #[simd_test(enable = "avx2")]
4862    unsafe fn test_mm256_packus_epi32() {
4863        let a = _mm256_set1_epi32(2);
4864        let b = _mm256_set1_epi32(4);
4865        let r = _mm256_packus_epi32(a, b);
4866        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
4867
4868        assert_eq_m256i(r, e);
4869    }
4870
4871    #[simd_test(enable = "avx2")]
4872    unsafe fn test_mm256_sad_epu8() {
4873        let a = _mm256_set1_epi8(2);
4874        let b = _mm256_set1_epi8(4);
4875        let r = _mm256_sad_epu8(a, b);
4876        let e = _mm256_set1_epi64x(16);
4877        assert_eq_m256i(r, e);
4878    }
4879
4880    #[simd_test(enable = "avx2")]
4881    unsafe fn test_mm256_shufflehi_epi16() {
4882        #[rustfmt::skip]
4883        let a = _mm256_setr_epi16(
4884            0, 1, 2, 3, 11, 22, 33, 44,
4885            4, 5, 6, 7, 55, 66, 77, 88,
4886        );
4887        #[rustfmt::skip]
4888        let e = _mm256_setr_epi16(
4889            0, 1, 2, 3, 44, 22, 22, 11,
4890            4, 5, 6, 7, 88, 66, 66, 55,
4891        );
4892        let r = _mm256_shufflehi_epi16::<0b00_01_01_11>(a);
4893        assert_eq_m256i(r, e);
4894    }
4895
4896    #[simd_test(enable = "avx2")]
4897    unsafe fn test_mm256_shufflelo_epi16() {
4898        #[rustfmt::skip]
4899        let a = _mm256_setr_epi16(
4900            11, 22, 33, 44, 0, 1, 2, 3,
4901            55, 66, 77, 88, 4, 5, 6, 7,
4902        );
4903        #[rustfmt::skip]
4904        let e = _mm256_setr_epi16(
4905            44, 22, 22, 11, 0, 1, 2, 3,
4906            88, 66, 66, 55, 4, 5, 6, 7,
4907        );
4908        let r = _mm256_shufflelo_epi16::<0b00_01_01_11>(a);
4909        assert_eq_m256i(r, e);
4910    }
4911
4912    #[simd_test(enable = "avx2")]
4913    unsafe fn test_mm256_sign_epi16() {
4914        let a = _mm256_set1_epi16(2);
4915        let b = _mm256_set1_epi16(-1);
4916        let r = _mm256_sign_epi16(a, b);
4917        let e = _mm256_set1_epi16(-2);
4918        assert_eq_m256i(r, e);
4919    }
4920
4921    #[simd_test(enable = "avx2")]
4922    unsafe fn test_mm256_sign_epi32() {
4923        let a = _mm256_set1_epi32(2);
4924        let b = _mm256_set1_epi32(-1);
4925        let r = _mm256_sign_epi32(a, b);
4926        let e = _mm256_set1_epi32(-2);
4927        assert_eq_m256i(r, e);
4928    }
4929
4930    #[simd_test(enable = "avx2")]
4931    unsafe fn test_mm256_sign_epi8() {
4932        let a = _mm256_set1_epi8(2);
4933        let b = _mm256_set1_epi8(-1);
4934        let r = _mm256_sign_epi8(a, b);
4935        let e = _mm256_set1_epi8(-2);
4936        assert_eq_m256i(r, e);
4937    }
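
    // Editor's supplementary sketch (not upstream): a zero in the second
    // operand of the sign intrinsics clears the corresponding lane rather than
    // leaving it unchanged. Hypothetical test name.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_sign_epi16_zero() {
        let a = _mm256_set1_epi16(2);
        let zero = _mm256_set1_epi16(0);
        let r = _mm256_sign_epi16(a, zero);
        assert_eq_m256i(r, zero);
    }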
4938
4939    #[simd_test(enable = "avx2")]
4940    unsafe fn test_mm256_sll_epi16() {
4941        let a = _mm256_set1_epi16(0xFF);
4942        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
4943        let r = _mm256_sll_epi16(a, b);
4944        assert_eq_m256i(r, _mm256_set1_epi16(0xFF0));
4945    }
4946
4947    #[simd_test(enable = "avx2")]
4948    unsafe fn test_mm256_sll_epi32() {
4949        let a = _mm256_set1_epi32(0xFFFF);
4950        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
4951        let r = _mm256_sll_epi32(a, b);
4952        assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0));
4953    }
4954
4955    #[simd_test(enable = "avx2")]
4956    unsafe fn test_mm256_sll_epi64() {
4957        let a = _mm256_set1_epi64x(0xFFFFFFFF);
4958        let b = _mm_insert_epi64::<0>(_mm_set1_epi64x(0), 4);
4959        let r = _mm256_sll_epi64(a, b);
4960        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0));
4961    }
4962
4963    #[simd_test(enable = "avx2")]
4964    unsafe fn test_mm256_slli_epi16() {
4965        assert_eq_m256i(
4966            _mm256_slli_epi16::<4>(_mm256_set1_epi16(0xFF)),
4967            _mm256_set1_epi16(0xFF0),
4968        );
4969    }
4970
4971    #[simd_test(enable = "avx2")]
4972    unsafe fn test_mm256_slli_epi32() {
4973        assert_eq_m256i(
4974            _mm256_slli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
4975            _mm256_set1_epi32(0xFFFF0),
4976        );
4977    }
4978
4979    #[simd_test(enable = "avx2")]
4980    unsafe fn test_mm256_slli_epi64() {
4981        assert_eq_m256i(
4982            _mm256_slli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
4983            _mm256_set1_epi64x(0xFFFFFFFF0),
4984        );
4985    }
4986
4987    #[simd_test(enable = "avx2")]
4988    unsafe fn test_mm256_slli_si256() {
4989        let a = _mm256_set1_epi64x(0xFFFFFFFF);
4990        let r = _mm256_slli_si256::<3>(a);
4991        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
4992    }
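
    // Editor's supplementary sketch (not upstream): like the `srli_si256` test
    // further down, this shows that the byte shift works within each 128-bit
    // lane independently; nothing crosses the lane boundary. Hypothetical name.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_slli_si256_per_lane() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm256_slli_si256::<3>(a);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            0, 0, 0, 1, 2, 3, 4, 5,
            6, 7, 8, 9, 10, 11, 12, 13,
            0, 0, 0, 17, 18, 19, 20, 21,
            22, 23, 24, 25, 26, 27, 28, 29,
        );
        assert_eq_m256i(r, e);
    }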
4993
4994    #[simd_test(enable = "avx2")]
4995    unsafe fn test_mm_sllv_epi32() {
4996        let a = _mm_set1_epi32(2);
4997        let b = _mm_set1_epi32(1);
4998        let r = _mm_sllv_epi32(a, b);
4999        let e = _mm_set1_epi32(4);
5000        assert_eq_m128i(r, e);
5001    }
5002
5003    #[simd_test(enable = "avx2")]
5004    unsafe fn test_mm256_sllv_epi32() {
5005        let a = _mm256_set1_epi32(2);
5006        let b = _mm256_set1_epi32(1);
5007        let r = _mm256_sllv_epi32(a, b);
5008        let e = _mm256_set1_epi32(4);
5009        assert_eq_m256i(r, e);
5010    }
5011
5012    #[simd_test(enable = "avx2")]
5013    unsafe fn test_mm_sllv_epi64() {
5014        let a = _mm_set1_epi64x(2);
5015        let b = _mm_set1_epi64x(1);
5016        let r = _mm_sllv_epi64(a, b);
5017        let e = _mm_set1_epi64x(4);
5018        assert_eq_m128i(r, e);
5019    }
5020
5021    #[simd_test(enable = "avx2")]
5022    unsafe fn test_mm256_sllv_epi64() {
5023        let a = _mm256_set1_epi64x(2);
5024        let b = _mm256_set1_epi64x(1);
5025        let r = _mm256_sllv_epi64(a, b);
5026        let e = _mm256_set1_epi64x(4);
5027        assert_eq_m256i(r, e);
5028    }
5029
5030    #[simd_test(enable = "avx2")]
5031    unsafe fn test_mm256_sra_epi16() {
5032        let a = _mm256_set1_epi16(-1);
5033        let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
5034        let r = _mm256_sra_epi16(a, b);
5035        assert_eq_m256i(r, _mm256_set1_epi16(-1));
5036    }
5037
5038    #[simd_test(enable = "avx2")]
5039    unsafe fn test_mm256_sra_epi32() {
5040        let a = _mm256_set1_epi32(-1);
5041        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 1);
5042        let r = _mm256_sra_epi32(a, b);
5043        assert_eq_m256i(r, _mm256_set1_epi32(-1));
5044    }
5045
5046    #[simd_test(enable = "avx2")]
5047    unsafe fn test_mm256_srai_epi16() {
5048        assert_eq_m256i(
5049            _mm256_srai_epi16::<1>(_mm256_set1_epi16(-1)),
5050            _mm256_set1_epi16(-1),
5051        );
5052    }
5053
5054    #[simd_test(enable = "avx2")]
5055    unsafe fn test_mm256_srai_epi32() {
5056        assert_eq_m256i(
5057            _mm256_srai_epi32::<1>(_mm256_set1_epi32(-1)),
5058            _mm256_set1_epi32(-1),
5059        );
5060    }
5061
5062    #[simd_test(enable = "avx2")]
5063    unsafe fn test_mm_srav_epi32() {
5064        let a = _mm_set1_epi32(4);
5065        let count = _mm_set1_epi32(1);
5066        let r = _mm_srav_epi32(a, count);
5067        let e = _mm_set1_epi32(2);
5068        assert_eq_m128i(r, e);
5069    }
5070
5071    #[simd_test(enable = "avx2")]
5072    unsafe fn test_mm256_srav_epi32() {
5073        let a = _mm256_set1_epi32(4);
5074        let count = _mm256_set1_epi32(1);
5075        let r = _mm256_srav_epi32(a, count);
5076        let e = _mm256_set1_epi32(2);
5077        assert_eq_m256i(r, e);
5078    }
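
    // Editor's supplementary sketch (not upstream): the arithmetic variable
    // shift preserves the sign bit, unlike the logical `srlv` forms tested
    // further down. Hypothetical test name.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_srav_epi32_negative() {
        let a = _mm256_set1_epi32(-8);
        let count = _mm256_set1_epi32(1);
        let r = _mm256_srav_epi32(a, count);
        assert_eq_m256i(r, _mm256_set1_epi32(-4));
    }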
5079
5080    #[simd_test(enable = "avx2")]
5081    unsafe fn test_mm256_srli_si256() {
5082        #[rustfmt::skip]
5083        let a = _mm256_setr_epi8(
5084            1, 2, 3, 4, 5, 6, 7, 8,
5085            9, 10, 11, 12, 13, 14, 15, 16,
5086            17, 18, 19, 20, 21, 22, 23, 24,
5087            25, 26, 27, 28, 29, 30, 31, 32,
5088        );
5089        let r = _mm256_srli_si256::<3>(a);
5090        #[rustfmt::skip]
5091        let e = _mm256_setr_epi8(
5092            4, 5, 6, 7, 8, 9, 10, 11,
5093            12, 13, 14, 15, 16, 0, 0, 0,
5094            20, 21, 22, 23, 24, 25, 26, 27,
5095            28, 29, 30, 31, 32, 0, 0, 0,
5096        );
5097        assert_eq_m256i(r, e);
5098    }
5099
5100    #[simd_test(enable = "avx2")]
5101    unsafe fn test_mm256_srl_epi16() {
5102        let a = _mm256_set1_epi16(0xFF);
5103        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
5104        let r = _mm256_srl_epi16(a, b);
5105        assert_eq_m256i(r, _mm256_set1_epi16(0xF));
5106    }
5107
5108    #[simd_test(enable = "avx2")]
5109    unsafe fn test_mm256_srl_epi32() {
5110        let a = _mm256_set1_epi32(0xFFFF);
5111        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
5112        let r = _mm256_srl_epi32(a, b);
5113        assert_eq_m256i(r, _mm256_set1_epi32(0xFFF));
5114    }
5115
5116    #[simd_test(enable = "avx2")]
5117    unsafe fn test_mm256_srl_epi64() {
5118        let a = _mm256_set1_epi64x(0xFFFFFFFF);
5119        let b = _mm_setr_epi64x(4, 0);
5120        let r = _mm256_srl_epi64(a, b);
5121        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF));
5122    }
5123
5124    #[simd_test(enable = "avx2")]
5125    unsafe fn test_mm256_srli_epi16() {
5126        assert_eq_m256i(
5127            _mm256_srli_epi16::<4>(_mm256_set1_epi16(0xFF)),
5128            _mm256_set1_epi16(0xF),
5129        );
5130    }
5131
5132    #[simd_test(enable = "avx2")]
5133    unsafe fn test_mm256_srli_epi32() {
5134        assert_eq_m256i(
5135            _mm256_srli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
5136            _mm256_set1_epi32(0xFFF),
5137        );
5138    }
5139
5140    #[simd_test(enable = "avx2")]
5141    unsafe fn test_mm256_srli_epi64() {
5142        assert_eq_m256i(
5143            _mm256_srli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
5144            _mm256_set1_epi64x(0xFFFFFFF),
5145        );
5146    }
5147
5148    #[simd_test(enable = "avx2")]
5149    unsafe fn test_mm_srlv_epi32() {
5150        let a = _mm_set1_epi32(2);
5151        let count = _mm_set1_epi32(1);
5152        let r = _mm_srlv_epi32(a, count);
5153        let e = _mm_set1_epi32(1);
5154        assert_eq_m128i(r, e);
5155    }
5156
5157    #[simd_test(enable = "avx2")]
5158    unsafe fn test_mm256_srlv_epi32() {
5159        let a = _mm256_set1_epi32(2);
5160        let count = _mm256_set1_epi32(1);
5161        let r = _mm256_srlv_epi32(a, count);
5162        let e = _mm256_set1_epi32(1);
5163        assert_eq_m256i(r, e);
5164    }
5165
5166    #[simd_test(enable = "avx2")]
5167    unsafe fn test_mm_srlv_epi64() {
5168        let a = _mm_set1_epi64x(2);
5169        let count = _mm_set1_epi64x(1);
5170        let r = _mm_srlv_epi64(a, count);
5171        let e = _mm_set1_epi64x(1);
5172        assert_eq_m128i(r, e);
5173    }
5174
5175    #[simd_test(enable = "avx2")]
5176    unsafe fn test_mm256_srlv_epi64() {
5177        let a = _mm256_set1_epi64x(2);
5178        let count = _mm256_set1_epi64x(1);
5179        let r = _mm256_srlv_epi64(a, count);
5180        let e = _mm256_set1_epi64x(1);
5181        assert_eq_m256i(r, e);
5182    }
5183
5184    #[simd_test(enable = "avx2")]
5185    unsafe fn test_mm256_stream_load_si256() {
5186        let a = _mm256_set_epi64x(5, 6, 7, 8);
5187        let r = _mm256_stream_load_si256(core::ptr::addr_of!(a) as *const _);
5188        assert_eq_m256i(a, r);
5189    }
5190
5191    #[simd_test(enable = "avx2")]
5192    unsafe fn test_mm256_sub_epi16() {
5193        let a = _mm256_set1_epi16(4);
5194        let b = _mm256_set1_epi16(2);
5195        let r = _mm256_sub_epi16(a, b);
5196        assert_eq_m256i(r, b);
5197    }
5198
5199    #[simd_test(enable = "avx2")]
5200    unsafe fn test_mm256_sub_epi32() {
5201        let a = _mm256_set1_epi32(4);
5202        let b = _mm256_set1_epi32(2);
5203        let r = _mm256_sub_epi32(a, b);
5204        assert_eq_m256i(r, b);
5205    }
5206
5207    #[simd_test(enable = "avx2")]
5208    unsafe fn test_mm256_sub_epi64() {
5209        let a = _mm256_set1_epi64x(4);
5210        let b = _mm256_set1_epi64x(2);
5211        let r = _mm256_sub_epi64(a, b);
5212        assert_eq_m256i(r, b);
5213    }
5214
5215    #[simd_test(enable = "avx2")]
5216    unsafe fn test_mm256_sub_epi8() {
5217        let a = _mm256_set1_epi8(4);
5218        let b = _mm256_set1_epi8(2);
5219        let r = _mm256_sub_epi8(a, b);
5220        assert_eq_m256i(r, b);
5221    }
5222
5223    #[simd_test(enable = "avx2")]
5224    unsafe fn test_mm256_subs_epi16() {
5225        let a = _mm256_set1_epi16(4);
5226        let b = _mm256_set1_epi16(2);
5227        let r = _mm256_subs_epi16(a, b);
5228        assert_eq_m256i(r, b);
5229    }
5230
5231    #[simd_test(enable = "avx2")]
5232    unsafe fn test_mm256_subs_epi8() {
5233        let a = _mm256_set1_epi8(4);
5234        let b = _mm256_set1_epi8(2);
5235        let r = _mm256_subs_epi8(a, b);
5236        assert_eq_m256i(r, b);
5237    }
5238
5239    #[simd_test(enable = "avx2")]
5240    unsafe fn test_mm256_subs_epu16() {
5241        let a = _mm256_set1_epi16(4);
5242        let b = _mm256_set1_epi16(2);
5243        let r = _mm256_subs_epu16(a, b);
5244        assert_eq_m256i(r, b);
5245    }
5246
5247    #[simd_test(enable = "avx2")]
5248    unsafe fn test_mm256_subs_epu8() {
5249        let a = _mm256_set1_epi8(4);
5250        let b = _mm256_set1_epi8(2);
5251        let r = _mm256_subs_epu8(a, b);
5252        assert_eq_m256i(r, b);
5253    }
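
    // Editor's supplementary sketch (not upstream): unsigned saturating
    // subtraction clamps at zero instead of wrapping around. Hypothetical name.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_subs_epu8_saturate() {
        let a = _mm256_set1_epi8(2);
        let b = _mm256_set1_epi8(4);
        let r = _mm256_subs_epu8(a, b);
        assert_eq_m256i(r, _mm256_set1_epi8(0));
    }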
5254
5255    #[simd_test(enable = "avx2")]
5256    unsafe fn test_mm256_xor_si256() {
5257        let a = _mm256_set1_epi8(5);
5258        let b = _mm256_set1_epi8(3);
5259        let r = _mm256_xor_si256(a, b);
5260        assert_eq_m256i(r, _mm256_set1_epi8(6));
5261    }
5262
5263    #[simd_test(enable = "avx2")]
5264    unsafe fn test_mm256_alignr_epi8() {
5265        #[rustfmt::skip]
5266        let a = _mm256_setr_epi8(
5267            1, 2, 3, 4, 5, 6, 7, 8,
5268            9, 10, 11, 12, 13, 14, 15, 16,
5269            17, 18, 19, 20, 21, 22, 23, 24,
5270            25, 26, 27, 28, 29, 30, 31, 32,
5271        );
5272        #[rustfmt::skip]
5273        let b = _mm256_setr_epi8(
5274            -1, -2, -3, -4, -5, -6, -7, -8,
5275            -9, -10, -11, -12, -13, -14, -15, -16,
5276            -17, -18, -19, -20, -21, -22, -23, -24,
5277            -25, -26, -27, -28, -29, -30, -31, -32,
5278        );
5279        let r = _mm256_alignr_epi8::<33>(a, b);
5280        assert_eq_m256i(r, _mm256_set1_epi8(0));
5281
5282        let r = _mm256_alignr_epi8::<17>(a, b);
5283        #[rustfmt::skip]
5284        let expected = _mm256_setr_epi8(
5285            2, 3, 4, 5, 6, 7, 8, 9,
5286            10, 11, 12, 13, 14, 15, 16, 0,
5287            18, 19, 20, 21, 22, 23, 24, 25,
5288            26, 27, 28, 29, 30, 31, 32, 0,
5289        );
5290        assert_eq_m256i(r, expected);
5291
5292        let r = _mm256_alignr_epi8::<4>(a, b);
5293        #[rustfmt::skip]
5294        let expected = _mm256_setr_epi8(
5295            -5, -6, -7, -8, -9, -10, -11, -12,
5296            -13, -14, -15, -16, 1, 2, 3, 4,
5297            -21, -22, -23, -24, -25, -26, -27, -28,
5298            -29, -30, -31, -32, 17, 18, 19, 20,
5299        );
5300        assert_eq_m256i(r, expected);
5301
5302        let r = _mm256_alignr_epi8::<15>(a, b);
5303        #[rustfmt::skip]
5304        let expected = _mm256_setr_epi8(
5305            -16, 1, 2, 3, 4, 5, 6, 7,
5306            8, 9, 10, 11, 12, 13, 14, 15,
5307            -32, 17, 18, 19, 20, 21, 22, 23,
5308            24, 25, 26, 27, 28, 29, 30, 31,
5309        );
5310        assert_eq_m256i(r, expected);
5311
5312        let r = _mm256_alignr_epi8::<0>(a, b);
5313        assert_eq_m256i(r, b);
5314
5315        let r = _mm256_alignr_epi8::<16>(a, b);
5316        assert_eq_m256i(r, a);
5317    }
5318
5319    #[simd_test(enable = "avx2")]
5320    unsafe fn test_mm256_shuffle_epi8() {
5321        #[rustfmt::skip]
5322        let a = _mm256_setr_epi8(
5323            1, 2, 3, 4, 5, 6, 7, 8,
5324            9, 10, 11, 12, 13, 14, 15, 16,
5325            17, 18, 19, 20, 21, 22, 23, 24,
5326            25, 26, 27, 28, 29, 30, 31, 32,
5327        );
5328        #[rustfmt::skip]
5329        let b = _mm256_setr_epi8(
5330            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5331            12, 5, 5, 10, 4, 1, 8, 0,
5332            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5333            12, 5, 5, 10, 4, 1, 8, 0,
5334        );
5335        #[rustfmt::skip]
5336        let expected = _mm256_setr_epi8(
5337            5, 0, 5, 4, 9, 13, 7, 4,
5338            13, 6, 6, 11, 5, 2, 9, 1,
5339            21, 0, 21, 20, 25, 29, 23, 20,
5340            29, 22, 22, 27, 21, 18, 25, 17,
5341        );
5342        let r = _mm256_shuffle_epi8(a, b);
5343        assert_eq_m256i(r, expected);
5344    }
5345
5346    #[simd_test(enable = "avx2")]
5347    unsafe fn test_mm256_permutevar8x32_epi32() {
5348        let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
5349        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5350        let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
5351        let r = _mm256_permutevar8x32_epi32(a, b);
5352        assert_eq_m256i(r, expected);
5353    }
5354
5355    #[simd_test(enable = "avx2")]
5356    unsafe fn test_mm256_permute4x64_epi64() {
5357        let a = _mm256_setr_epi64x(100, 200, 300, 400);
5358        let expected = _mm256_setr_epi64x(400, 100, 200, 100);
5359        let r = _mm256_permute4x64_epi64::<0b00010011>(a);
5360        assert_eq_m256i(r, expected);
5361    }
5362
5363    #[simd_test(enable = "avx2")]
5364    unsafe fn test_mm256_permute2x128_si256() {
5365        let a = _mm256_setr_epi64x(100, 200, 500, 600);
5366        let b = _mm256_setr_epi64x(300, 400, 700, 800);
5367        let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
5368        let e = _mm256_setr_epi64x(700, 800, 500, 600);
5369        assert_eq_m256i(r, e);
5370    }
5371
5372    #[simd_test(enable = "avx2")]
5373    unsafe fn test_mm256_permute4x64_pd() {
5374        let a = _mm256_setr_pd(1., 2., 3., 4.);
5375        let r = _mm256_permute4x64_pd::<0b00_01_00_11>(a);
5376        let e = _mm256_setr_pd(4., 1., 2., 1.);
5377        assert_eq_m256d(r, e);
5378    }
5379
5380    #[simd_test(enable = "avx2")]
5381    unsafe fn test_mm256_permutevar8x32_ps() {
5382        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5383        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5384        let r = _mm256_permutevar8x32_ps(a, b);
5385        let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
5386        assert_eq_m256(r, e);
5387    }
5388
5389    #[simd_test(enable = "avx2")]
5390    unsafe fn test_mm_i32gather_epi32() {
5391        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5392        // A multiplier of 4 is word-addressing
5393        let r = _mm_i32gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5394        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5395    }
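
    // Editor's supplementary sketch (not upstream): with a scale of 1 the
    // gather indices are raw byte offsets, so offset 64 reads arr[16]
    // (16 * size_of::<i32>()). Hypothetical test name.
    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i32gather_epi32_byte_scale() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        let r = _mm_i32gather_epi32::<1>(arr.as_ptr(), _mm_setr_epi32(0, 64, 128, 192));
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
    }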
5396
5397    #[simd_test(enable = "avx2")]
5398    unsafe fn test_mm_mask_i32gather_epi32() {
5399        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5400        // A multiplier of 4 is word-addressing
5401        let r = _mm_mask_i32gather_epi32::<4>(
5402            _mm_set1_epi32(256),
5403            arr.as_ptr(),
5404            _mm_setr_epi32(0, 16, 64, 96),
5405            _mm_setr_epi32(-1, -1, -1, 0),
5406        );
5407        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5408    }
5409
5410    #[simd_test(enable = "avx2")]
5411    unsafe fn test_mm256_i32gather_epi32() {
5412        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5413        // A multiplier of 4 is word-addressing
5414        let r =
5415            _mm256_i32gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5416        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5417    }
5418
5419    #[simd_test(enable = "avx2")]
5420    unsafe fn test_mm256_mask_i32gather_epi32() {
5421        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5422        // A multiplier of 4 is word-addressing
5423        let r = _mm256_mask_i32gather_epi32::<4>(
5424            _mm256_set1_epi32(256),
5425            arr.as_ptr(),
5426            _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5427            _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
5428        );
5429        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
5430    }
5431
5432    #[simd_test(enable = "avx2")]
5433    unsafe fn test_mm_i32gather_ps() {
5434        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5435        // A multiplier of 4 is word-addressing for f32s
5436        let r = _mm_i32gather_ps::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5437        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5438    }
5439
5440    #[simd_test(enable = "avx2")]
5441    unsafe fn test_mm_mask_i32gather_ps() {
5442        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5443        // A multiplier of 4 is word-addressing for f32s
5444        let r = _mm_mask_i32gather_ps::<4>(
5445            _mm_set1_ps(256.0),
5446            arr.as_ptr(),
5447            _mm_setr_epi32(0, 16, 64, 96),
5448            _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5449        );
5450        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5451    }
5452
5453    #[simd_test(enable = "avx2")]
5454    unsafe fn test_mm256_i32gather_ps() {
5455        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5456        // A multiplier of 4 is word-addressing for f32s
5457        let r =
5458            _mm256_i32gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5459        assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
5460    }
5461
5462    #[simd_test(enable = "avx2")]
5463    unsafe fn test_mm256_mask_i32gather_ps() {
5464        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5465        // A multiplier of 4 is word-addressing for f32s
5466        let r = _mm256_mask_i32gather_ps::<4>(
5467            _mm256_set1_ps(256.0),
5468            arr.as_ptr(),
5469            _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5470            _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
5471        );
5472        assert_eq_m256(
5473            r,
5474            _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
5475        );
5476    }
5477
5478    #[simd_test(enable = "avx2")]
5479    unsafe fn test_mm_i32gather_epi64() {
5480        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5481        // A multiplier of 8 is word-addressing for i64s
5482        let r = _mm_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
5483        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5484    }
5485
5486    #[simd_test(enable = "avx2")]
5487    unsafe fn test_mm_mask_i32gather_epi64() {
5488        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5489        // A multiplier of 8 is word-addressing for i64s
5490        let r = _mm_mask_i32gather_epi64::<8>(
5491            _mm_set1_epi64x(256),
5492            arr.as_ptr(),
5493            _mm_setr_epi32(16, 16, 16, 16),
5494            _mm_setr_epi64x(-1, 0),
5495        );
5496        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
5497    }
5498
5499    #[simd_test(enable = "avx2")]
5500    unsafe fn test_mm256_i32gather_epi64() {
5501        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5502        // A multiplier of 8 is word-addressing for i64s
5503        let r = _mm256_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5504        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
5505    }
5506
5507    #[simd_test(enable = "avx2")]
5508    unsafe fn test_mm256_mask_i32gather_epi64() {
5509        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5510        // A multiplier of 8 is word-addressing for i64s
5511        let r = _mm256_mask_i32gather_epi64::<8>(
5512            _mm256_set1_epi64x(256),
5513            arr.as_ptr(),
5514            _mm_setr_epi32(0, 16, 64, 96),
5515            _mm256_setr_epi64x(-1, -1, -1, 0),
5516        );
5517        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
5518    }
5519
5520    #[simd_test(enable = "avx2")]
5521    unsafe fn test_mm_i32gather_pd() {
5522        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5523        // A multiplier of 8 is word-addressing for f64s
5524        let r = _mm_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
5525        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
5526    }
5527
5528    #[simd_test(enable = "avx2")]
5529    unsafe fn test_mm_mask_i32gather_pd() {
5530        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5531        // A multiplier of 8 is word-addressing for f64s
5532        let r = _mm_mask_i32gather_pd::<8>(
5533            _mm_set1_pd(256.0),
5534            arr.as_ptr(),
5535            _mm_setr_epi32(16, 16, 16, 16),
5536            _mm_setr_pd(-1.0, 0.0),
5537        );
5538        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
5539    }
5540
5541    #[simd_test(enable = "avx2")]
5542    unsafe fn test_mm256_i32gather_pd() {
5543        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5544        // A multiplier of 8 is word-addressing for f64s
5545        let r = _mm256_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5546        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
5547    }
5548
5549    #[simd_test(enable = "avx2")]
5550    unsafe fn test_mm256_mask_i32gather_pd() {
5551        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5552        // A multiplier of 8 is word-addressing for f64s
5553        let r = _mm256_mask_i32gather_pd::<8>(
5554            _mm256_set1_pd(256.0),
5555            arr.as_ptr(),
5556            _mm_setr_epi32(0, 16, 64, 96),
5557            _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
5558        );
5559        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
5560    }
5561
5562    #[simd_test(enable = "avx2")]
5563    unsafe fn test_mm_i64gather_epi32() {
5564        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5565        // A multiplier of 4 is word-addressing
5566        let r = _mm_i64gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
5567        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
5568    }
5569
5570    #[simd_test(enable = "avx2")]
5571    unsafe fn test_mm_mask_i64gather_epi32() {
5572        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5573        // A multiplier of 4 is word-addressing
5574        let r = _mm_mask_i64gather_epi32::<4>(
5575            _mm_set1_epi32(256),
5576            arr.as_ptr(),
5577            _mm_setr_epi64x(0, 16),
5578            _mm_setr_epi32(-1, 0, -1, 0),
5579        );
5580        assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
5581    }
5582
5583    #[simd_test(enable = "avx2")]
5584    unsafe fn test_mm256_i64gather_epi32() {
5585        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5586        // A multiplier of 4 is word-addressing
5587        let r = _mm256_i64gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
5588        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5589    }
5590
5591    #[simd_test(enable = "avx2")]
5592    unsafe fn test_mm256_mask_i64gather_epi32() {
5593        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5594        // A multiplier of 4 is word-addressing
5595        let r = _mm256_mask_i64gather_epi32::<4>(
5596            _mm_set1_epi32(256),
5597            arr.as_ptr(),
5598            _mm256_setr_epi64x(0, 16, 64, 96),
5599            _mm_setr_epi32(-1, -1, -1, 0),
5600        );
5601        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5602    }
5603
5604    #[simd_test(enable = "avx2")]
5605    unsafe fn test_mm_i64gather_ps() {
5606        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5607        // A multiplier of 4 is word-addressing for f32s
5608        let r = _mm_i64gather_ps::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
5609        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
5610    }
5611
5612    #[simd_test(enable = "avx2")]
5613    unsafe fn test_mm_mask_i64gather_ps() {
5614        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5615        // A multiplier of 4 is word-addressing for f32s
5616        let r = _mm_mask_i64gather_ps::<4>(
5617            _mm_set1_ps(256.0),
5618            arr.as_ptr(),
5619            _mm_setr_epi64x(0, 16),
5620            _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
5621        );
5622        assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
5623    }
5624
5625    #[simd_test(enable = "avx2")]
5626    unsafe fn test_mm256_i64gather_ps() {
5627        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5628        // A multiplier of 4 is word-addressing for f32s
5629        let r = _mm256_i64gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
5630        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5631    }
5632
5633    #[simd_test(enable = "avx2")]
5634    unsafe fn test_mm256_mask_i64gather_ps() {
5635        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5636        // A multiplier of 4 is word-addressing for f32s
5637        let r = _mm256_mask_i64gather_ps::<4>(
5638            _mm_set1_ps(256.0),
5639            arr.as_ptr(),
5640            _mm256_setr_epi64x(0, 16, 64, 96),
5641            _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5642        );
5643        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5644    }
5645
5646    #[simd_test(enable = "avx2")]
5647    unsafe fn test_mm_i64gather_epi64() {
5648        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5649        // A multiplier of 8 is word-addressing for i64s
5650        let r = _mm_i64gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
5651        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5652    }
5653
5654    #[simd_test(enable = "avx2")]
5655    unsafe fn test_mm_mask_i64gather_epi64() {
5656        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5657        // A multiplier of 8 is word-addressing for i64s
5658        let r = _mm_mask_i64gather_epi64::<8>(
5659            _mm_set1_epi64x(256),
5660            arr.as_ptr(),
5661            _mm_setr_epi64x(16, 16),
5662            _mm_setr_epi64x(-1, 0),
5663        );
5664        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
5665    }
5666
5667    #[simd_test(enable = "avx2")]
5668    unsafe fn test_mm256_i64gather_epi64() {
5669        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5670        // A multiplier of 8 is word-addressing for i64s
5671        let r = _mm256_i64gather_epi64::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
5672        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
5673    }
5674
5675    #[simd_test(enable = "avx2")]
5676    unsafe fn test_mm256_mask_i64gather_epi64() {
5677        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5678        // A multiplier of 8 is word-addressing for i64s
5679        let r = _mm256_mask_i64gather_epi64::<8>(
5680            _mm256_set1_epi64x(256),
5681            arr.as_ptr(),
5682            _mm256_setr_epi64x(0, 16, 64, 96),
5683            _mm256_setr_epi64x(-1, -1, -1, 0),
5684        );
5685        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
5686    }
5687
5688    #[simd_test(enable = "avx2")]
5689    unsafe fn test_mm_i64gather_pd() {
5690        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5691        // A multiplier of 8 is word-addressing for f64s
5692        let r = _mm_i64gather_pd::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
5693        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
5694    }
5695
5696    #[simd_test(enable = "avx2")]
5697    unsafe fn test_mm_mask_i64gather_pd() {
5698        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5699        // A multiplier of 8 is word-addressing for f64s
5700        let r = _mm_mask_i64gather_pd::<8>(
5701            _mm_set1_pd(256.0),
5702            arr.as_ptr(),
5703            _mm_setr_epi64x(16, 16),
5704            _mm_setr_pd(-1.0, 0.0),
5705        );
5706        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
5707    }
5708
5709    #[simd_test(enable = "avx2")]
5710    unsafe fn test_mm256_i64gather_pd() {
5711        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5712        // A multiplier of 8 is word-addressing for f64s
5713        let r = _mm256_i64gather_pd::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
5714        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
5715    }
5716
5717    #[simd_test(enable = "avx2")]
5718    unsafe fn test_mm256_mask_i64gather_pd() {
5719        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5720        // A multiplier of 8 is word-addressing for f64s
5721        let r = _mm256_mask_i64gather_pd::<8>(
5722            _mm256_set1_pd(256.0),
5723            arr.as_ptr(),
5724            _mm256_setr_epi64x(0, 16, 64, 96),
5725            _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
5726        );
5727        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
5728    }
5729
5730    #[simd_test(enable = "avx2")]
5731    unsafe fn test_mm256_extract_epi8() {
5732        #[rustfmt::skip]
5733        let a = _mm256_setr_epi8(
5734            -1, 1, 2, 3, 4, 5, 6, 7,
5735            8, 9, 10, 11, 12, 13, 14, 15,
5736            16, 17, 18, 19, 20, 21, 22, 23,
5737            24, 25, 26, 27, 28, 29, 30, 31
5738        );
5739        let r1 = _mm256_extract_epi8::<0>(a);
5740        let r2 = _mm256_extract_epi8::<3>(a);
5741        assert_eq!(r1, 0xFF);
5742        assert_eq!(r2, 3);
5743    }
5744
5745    #[simd_test(enable = "avx2")]
5746    unsafe fn test_mm256_extract_epi16() {
5747        #[rustfmt::skip]
5748        let a = _mm256_setr_epi16(
5749            -1, 1, 2, 3, 4, 5, 6, 7,
5750            8, 9, 10, 11, 12, 13, 14, 15,
5751        );
5752        let r1 = _mm256_extract_epi16::<0>(a);
5753        let r2 = _mm256_extract_epi16::<3>(a);
5754        assert_eq!(r1, 0xFFFF);
5755        assert_eq!(r2, 3);
5756    }
5757}