core/stdarch/crates/core_arch/src/x86/sse41.rs

//! Streaming SIMD Extensions 4.1 (SSE4.1)

use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::simd::*;

#[cfg(test)]
use stdarch_test::assert_instr;

// SSE4 rounding constants
/// round to nearest
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
/// round down
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
/// round up
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
/// truncate
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
/// use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
/// do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
/// suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NO_EXC: i32 = 0x08;
/// round to nearest and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NINT: i32 = 0x00;
/// round down and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF;
/// round up and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF;
/// truncate and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO;
/// use MXCSR.RC and do not suppress exceptions; see
/// `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION;
/// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION;

/// Blend packed 8-bit integers from `a` and `b` using `mask`
///
/// The high bit of each corresponding mask byte determines the selection.
/// If the high bit is set, the element of `b` is selected.
/// Otherwise, the element of `a` is selected.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8)
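///
/// # Example
///
/// A minimal usage sketch; the values are chosen only to illustrate the
/// selection rule:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       #[allow(unused_unsafe)]
/// #       unsafe fn worker() { unsafe {
/// let a = _mm_set1_epi8(0);
/// let b = _mm_set1_epi8(1);
/// // Only the high bit of each mask byte matters: -1 selects from `b`.
/// let mask = _mm_setr_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
/// let r = _mm_blendv_epi8(a, b, mask);
/// let e = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi8(r, e)), 0xFFFF);
/// #       }}
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```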
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
    unsafe {
        let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::ZERO);
        transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16()))
    }
}

/// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`.
///
/// The mask bits determine the selection. A clear bit selects the
/// corresponding element of `a`, and a set bit the corresponding
/// element of `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16)
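///
/// # Example
///
/// A minimal usage sketch; bit `i` of `IMM8` selects lane `i` from `b`:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       #[allow(unused_unsafe)]
/// #       unsafe fn worker() { unsafe {
/// let a = _mm_set1_epi16(0);
/// let b = _mm_set1_epi16(1);
/// // Bits 0 and 2 of the immediate are set, so lanes 0 and 2 come from `b`.
/// let r = _mm_blend_epi16::<0b0000_0101>(a, b);
/// let e = _mm_setr_epi16(1, 0, 1, 0, 0, 0, 0, 0);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi16(r, e)), 0xFFFF);
/// #       }}
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```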
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xB1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        transmute::<i16x8, _>(simd_shuffle!(
            a.as_i16x8(),
            b.as_i16x8(),
            [
                [0, 8][IMM8 as usize & 1],
                [1, 9][(IMM8 >> 1) as usize & 1],
                [2, 10][(IMM8 >> 2) as usize & 1],
                [3, 11][(IMM8 >> 3) as usize & 1],
                [4, 12][(IMM8 >> 4) as usize & 1],
                [5, 13][(IMM8 >> 5) as usize & 1],
                [6, 14][(IMM8 >> 6) as usize & 1],
                [7, 15][(IMM8 >> 7) as usize & 1],
            ]
        ))
    }
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
    unsafe {
        let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::ZERO);
        transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2()))
    }
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
    unsafe {
        let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::ZERO);
        transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4()))
    }
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using control mask `IMM2`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(blendpd, IMM2 = 0b10))]
#[cfg_attr(test, assert_instr(blendps, IMM2 = 0b10))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM2, 2);
    unsafe {
        transmute::<f64x2, _>(simd_shuffle!(
            a.as_f64x2(),
            b.as_f64x2(),
            [[0, 2][IMM2 as usize & 1], [1, 3][(IMM2 >> 1) as usize & 1]]
        ))
    }
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using mask `IMM4`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendps, IMM4 = 0b0101))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM4, 4);
    unsafe {
        transmute::<f32x4, _>(simd_shuffle!(
            a.as_f32x4(),
            b.as_f32x4(),
            [
                [0, 4][IMM4 as usize & 1],
                [1, 5][(IMM4 >> 1) as usize & 1],
                [2, 6][(IMM4 >> 2) as usize & 1],
                [3, 7][(IMM4 >> 3) as usize & 1],
            ]
        ))
    }
}

/// Extracts a single-precision (32-bit) floating-point element from `a`,
/// selected with `IMM8`. The returned `i32` stores the float's bit-pattern,
/// and may be converted back to a floating-point number with `f32::from_bits`.
///
/// # Example
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       #[allow(unused_unsafe)] // FIXME remove after stdarch bump in rustc
/// #       unsafe fn worker() { unsafe {
/// let mut float_store = vec![1.0, 1.0, 2.0, 3.0];
/// let simd_floats = _mm_set_ps(2.5, 5.0, 7.5, 10.0);
/// let x: i32 = _mm_extract_ps::<2>(simd_floats);
/// float_store.push(f32::from_bits(x as u32));
/// assert_eq!(float_store, vec![1.0, 1.0, 2.0, 3.0, 5.0]);
/// #       }}
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(extractps, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_extract_ps<const IMM8: i32>(a: __m128) -> i32 {
    static_assert_uimm_bits!(IMM8, 2);
    unsafe { simd_extract!(a, IMM8 as u32, f32).to_bits() as i32 }
}

/// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit
/// integer containing the zero-extended integer data.
///
/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8)
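///
/// # Example
///
/// A short sketch showing the zero-extension of the extracted byte:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       #[allow(unused_unsafe)]
/// #       unsafe fn worker() { unsafe {
/// let a = _mm_setr_epi8(-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
/// // The byte is zero-extended, so -1 comes back as 0xFF, not -1.
/// assert_eq!(_mm_extract_epi8::<0>(a), 0xFF);
/// assert_eq!(_mm_extract_epi8::<3>(a), 3);
/// #       }}
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```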
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pextrb, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_extract_epi8<const IMM8: i32>(a: __m128i) -> i32 {
    static_assert_uimm_bits!(IMM8, 4);
    unsafe { simd_extract!(a.as_u8x16(), IMM8 as u32, u8) as i32 }
}

/// Extracts a 32-bit integer from `a`, selected with `IMM8`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(extractps, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_extract_epi32<const IMM8: i32>(a: __m128i) -> i32 {
    static_assert_uimm_bits!(IMM8, 2);
    unsafe { simd_extract!(a.as_i32x4(), IMM8 as u32, i32) }
}

/// Selects a single value in `b` to store at some position in `a`,
/// then zeroes elements according to `IMM8`.
///
/// `IMM8` specifies which bits from operand `b` will be copied, which bits in
/// the result they will be copied to, and which bits in the result will be
/// cleared. The following assignments are made:
///
/// * Bits `[7:6]` specify the bits to copy from operand `b`:
///     - `00`: Selects bits `[31:0]` from operand `b`.
///     - `01`: Selects bits `[63:32]` from operand `b`.
///     - `10`: Selects bits `[95:64]` from operand `b`.
///     - `11`: Selects bits `[127:96]` from operand `b`.
///
/// * Bits `[5:4]` specify the bits in the result to which the selected bits
///   from operand `b` are copied:
///     - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
///     - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
///     - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
///     - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
///
/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
///   element is cleared.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_ps)
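///
/// # Example
///
/// A minimal sketch of the three `IMM8` fields described above:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       #[allow(unused_unsafe)]
/// #       unsafe fn worker() { unsafe {
/// let a = _mm_set1_ps(1.0);
/// let b = _mm_setr_ps(10.0, 20.0, 30.0, 40.0);
/// // IMM8 = 0b11_01_0001: copy `b[3]` into result element 1, then zero element 0.
/// let r = _mm_insert_ps::<0b11_01_0001>(a, b);
/// let e = _mm_setr_ps(0.0, 40.0, 1.0, 1.0);
/// assert_eq!(_mm_movemask_ps(_mm_cmpeq_ps(r, e)), 0b1111);
/// #       }}
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```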
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(insertps, IMM8 = 0b1010))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_insert_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { insertps(a, b, IMM8 as u8) }
}

/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
/// location specified by `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrb, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_insert_epi8<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_uimm_bits!(IMM8, 4);
    unsafe { transmute(simd_insert!(a.as_i8x16(), IMM8 as u32, i as i8)) }
}

/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
/// location specified by `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_insert_epi32<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_uimm_bits!(IMM8, 2);
    unsafe { transmute(simd_insert!(a.as_i32x4(), IMM8 as u32, i)) }
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed maximum
/// values in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8)
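///
/// # Example
///
/// A small sketch highlighting that the comparison is signed:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       #[allow(unused_unsafe)]
/// #       unsafe fn worker() { unsafe {
/// // Signed comparison: -1 < 1, so 1 wins (an unsigned max would pick 0xFF).
/// let r = _mm_max_epi8(_mm_set1_epi8(-1), _mm_set1_epi8(1));
/// assert_eq!(_mm_extract_epi8::<0>(r), 1);
/// #       }}
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```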
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i8x16();
        let b = b.as_i8x16();
        transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b))
    }
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// maximum.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u16x8();
        let b = b.as_u16x8();
        transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b))
    }
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
/// values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i32x4();
        let b = b.as_i32x4();
        transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
    }
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u32x4();
        let b = b.as_u32x4();
        transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
    }
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed minimum
/// values in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i8x16();
        let b = b.as_i8x16();
        transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b))
    }
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// minimum.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u16x8();
        let b = b.as_u16x8();
        transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b))
    }
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
/// values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i32x4();
        let b = b.as_i32x4();
        transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b))
    }
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u32x4();
        let b = b.as_u32x4();
        transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b))
    }
}

/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32)
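///
/// # Example
///
/// A short sketch of the saturation behavior; the inputs are illustrative:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       #[allow(unused_unsafe)]
/// #       unsafe fn worker() { unsafe {
/// // Values outside 0..=65535 saturate: negatives clamp to 0, 70000 to 65535.
/// let a = _mm_setr_epi32(-1, 0, 70000, 1234);
/// let r = _mm_packus_epi32(a, a);
/// assert_eq!(_mm_extract_epi16::<0>(r), 0);
/// assert_eq!(_mm_extract_epi16::<2>(r), 0xFFFF);
/// assert_eq!(_mm_extract_epi16::<3>(r), 1234);
/// #       }}
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```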
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(packusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(packusdw(a.as_i32x4(), b.as_i32x4())) }
}

/// Compares packed 64-bit integers in `a` and `b` for equality
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2())) }
}

/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16)
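///
/// # Example
///
/// A minimal sketch showing that negative values keep their sign:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       #[allow(unused_unsafe)]
/// #       unsafe fn worker() { unsafe {
/// let a = _mm_set1_epi8(-5);
/// let r = _mm_cvtepi8_epi16(a);
/// // Each byte is sign-extended, so every 16-bit lane is still -5.
/// assert_eq!(_mm_extract_epi16::<0>(r) as i16, -5);
/// #       }}
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```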
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i8x16();
        let a: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute(simd_cast::<_, i16x8>(a))
    }
}

/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i8x16();
        let a: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute(simd_cast::<_, i32x4>(a))
    }
}

/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed
/// 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i8x16();
        let a: i8x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}

/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i16x8();
        let a: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute(simd_cast::<_, i32x4>(a))
    }
}

/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i16x8();
        let a: i16x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_i32x4();
        let a: i32x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}

/// Zero-extends packed unsigned 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16)
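///
/// # Example
///
/// A minimal sketch; contrast with the sign-extending `_mm_cvtepi8_epi16`:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       #[allow(unused_unsafe)]
/// #       unsafe fn worker() { unsafe {
/// let a = _mm_set1_epi8(-1); // every byte is 0xFF
/// let r = _mm_cvtepu8_epi16(a);
/// // Zero extension treats 0xFF as 255, not -1.
/// assert_eq!(_mm_extract_epi16::<0>(r), 255);
/// #       }}
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```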
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u8x16();
        let a: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        transmute(simd_cast::<_, i16x8>(a))
    }
}

/// Zero-extends packed unsigned 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u8x16();
        let a: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute(simd_cast::<_, i32x4>(a))
    }
}

/// Zero-extends packed unsigned 8-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u8x16();
        let a: u8x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}

/// Zero-extends packed unsigned 16-bit integers in `a`
/// to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u16x8();
        let a: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
        transmute(simd_cast::<_, i32x4>(a))
    }
}

/// Zero-extends packed unsigned 16-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u16x8();
        let a: u16x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}

/// Zero-extends packed unsigned 32-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
    unsafe {
        let a = a.as_u32x4();
        let a: u32x2 = simd_shuffle!(a, a, [0, 1]);
        transmute(simd_cast::<_, i64x2>(a))
    }
}

/// Returns the dot product of two `__m128d` vectors.
///
/// `IMM8[1:0]` is the broadcast mask, and `IMM8[5:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dppd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_dp_pd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        dppd(a, b, IMM8 as u8)
    }
}

/// Returns the dot product of two `__m128` vectors.
///
/// `IMM8[3:0]` is the broadcast mask, and `IMM8[7:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps)
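///
/// # Example
///
/// A minimal sketch computing a full four-element dot product:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       #[allow(unused_unsafe)]
/// #       unsafe fn worker() { unsafe {
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
/// // 0xFF: multiply all four lanes and broadcast the sum to every lane.
/// let r = _mm_dp_ps::<0xFF>(a, b);
/// assert_eq!(_mm_cvtss_f32(r), 70.0); // 1*5 + 2*6 + 3*7 + 4*8
/// #       }}
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```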
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dpps, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_dp_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe { dpps(a, b, IMM8 as u8) }
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_floor_pd(a: __m128d) -> __m128d {
    unsafe { simd_floor(a) }
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_floor_ps(a: __m128) -> __m128 {
    unsafe { simd_floor(a) }
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// down to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { roundsd(a, b, _MM_FROUND_FLOOR) }
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// down to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { roundss(a, b, _MM_FROUND_FLOOR) }
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ceil_pd(a: __m128d) -> __m128d {
    unsafe { simd_ceil(a) }
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ceil_ps(a: __m128) -> __m128 {
    unsafe { simd_ceil(a) }
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// up to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
    unsafe { roundsd(a, b, _MM_FROUND_CEIL) }
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// up to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { roundss(a, b, _MM_FROUND_CEIL) }
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// double-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_round_pd<const ROUNDING: i32>(a: __m128d) -> __m128d {
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundpd(a, ROUNDING) }
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// single-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ps)
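///
/// # Example
///
/// A small sketch; with `_MM_FROUND_TO_NEAREST_INT`, ties round to even:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       #[allow(unused_unsafe)]
/// #       unsafe fn worker() { unsafe {
/// let a = _mm_set1_ps(2.5);
/// // Round to nearest, suppressing precision exceptions; 2.5 ties to 2.0.
/// let r = _mm_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
/// assert_eq!(_mm_cvtss_f32(r), 2.0);
/// #       }}
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```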
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_round_ps<const ROUNDING: i32>(a: __m128) -> __m128 {
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundps(a, ROUNDING) }
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundsd(a, b, ROUNDING) }
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(ROUNDING, 4);
    unsafe { roundss(a, b, ROUNDING) }
}

/// Finds the minimum unsigned 16-bit element in the 128-bit `__m128i` vector,
/// returning a vector containing its value in its first position, and its
/// index in its second position; all other elements are set to zero.
///
/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
/// instruction.
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
///
/// Returns:
///
/// A 128-bit value where:
///
/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
/// * bits `[18:16]` - contain the index of the minimum value
/// * remaining bits are set to `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16)
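///
/// # Example
///
/// A minimal usage sketch with arbitrary values:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       #[allow(unused_unsafe)]
/// #       unsafe fn worker() { unsafe {
/// let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
/// let r = _mm_minpos_epu16(a);
/// assert_eq!(_mm_extract_epi16::<0>(r), 13); // the minimum value
/// assert_eq!(_mm_extract_epi16::<1>(r), 5); // its index
/// #       }}
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```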
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(phminposuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_minpos_epu16(a: __m128i) -> __m128i {
    unsafe { transmute(phminposuw(a.as_u16x8())) }
}

/// Multiplies the low 32-bit integers from each packed 64-bit
/// element in `a` and `b`, and returns the signed 64-bit result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe {
        let a = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(a.as_i64x2()));
        let b = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(b.as_i64x2()));
        transmute(simd_mul(a, b))
    }
}

/// Multiplies the packed 32-bit integers in `a` and `b`, producing
/// intermediate 64-bit integers, and returns the low 32 bits of each
/// intermediate result, reinterpreted as a signed integer. While
/// `pmulld __m128i::splat(2), __m128i::splat(2)` returns the obvious
/// `__m128i::splat(4)`, due to wrapping arithmetic
/// `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would return a
/// negative number.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32)
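///
/// # Example
///
/// A small sketch of the wrapping behavior mentioned above:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       #[allow(unused_unsafe)]
/// #       unsafe fn worker() { unsafe {
/// // The low 32 bits wrap: i32::MAX * 2 comes back as -2.
/// let r = _mm_mullo_epi32(_mm_set1_epi32(i32::MAX), _mm_set1_epi32(2));
/// assert_eq!(_mm_extract_epi32::<0>(r), -2);
/// #       }}
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```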
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
    unsafe { transmute(simd_mul(a.as_i32x4(), b.as_i32x4())) }
}

/// Subtracts 8-bit unsigned integer values and computes the absolute
/// values of the differences. Sums of four consecutive absolute differences
/// are then returned according to the bit fields in the immediate operand.
///
/// The following algorithm is performed:
///
/// ```ignore
/// i = IMM8[2] * 4
/// j = IMM8[1:0] * 4
/// for k := 0 to 7
///     d0 = abs(a[i + k + 0] - b[j + 0])
///     d1 = abs(a[i + k + 1] - b[j + 1])
///     d2 = abs(a[i + k + 2] - b[j + 2])
///     d3 = abs(a[i + k + 3] - b[j + 3])
///     r[k] = d0 + d1 + d2 + d3
/// ```
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
/// * `b` - A 128-bit vector of type `__m128i`.
/// * `IMM8` - An 8-bit immediate operand specifying how the absolute
///   differences are to be calculated
///     * Bit `[2]` specifies the offset for operand `a`
///     * Bits `[1:0]` specify the offset for operand `b`
///
/// Returns:
///
/// * A `__m128i` vector containing the sums of the sets of absolute
///   differences between both operands.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8)
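///
/// # Example
///
/// A minimal sketch with `IMM8 = 0`, i.e. both offsets zero. Since the bytes
/// of `a` are linear, each sum works out to `4 * k`:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       #[allow(unused_unsafe)]
/// #       unsafe fn worker() { unsafe {
/// let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
/// // r[k] = |a[k] - a[0]| + |a[k+1] - a[1]| + |a[k+2] - a[2]| + |a[k+3] - a[3]|
/// let r = _mm_mpsadbw_epu8::<0>(a, a);
/// let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi16(r, e)), 0xFFFF);
/// #       }}
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```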
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(mpsadbw, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mpsadbw_epu8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 3);
    unsafe { transmute(mpsadbw(a.as_u8x16(), b.as_u8x16(), IMM8 as u8)) }
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128)
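///
/// # Example
///
/// A small sketch of both outcomes:
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       #[allow(unused_unsafe)]
/// #       unsafe fn worker() { unsafe {
/// // 1 & 2 == 0 in every lane, so the masked bits are all zero.
/// assert_eq!(_mm_testz_si128(_mm_set1_epi8(1), _mm_set1_epi8(2)), 1);
/// // 3 & 2 != 0, so at least one masked bit is set.
/// assert_eq!(_mm_testz_si128(_mm_set1_epi8(3), _mm_set1_epi8(2)), 0);
/// #       }}
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```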
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
    unsafe {
        let r = simd_reduce_or(simd_and(a.as_i64x2(), mask.as_i64x2()));
        (0i64 == r) as i32
    }
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
    unsafe {
        let r = simd_reduce_or(simd_and(
            simd_xor(a.as_i64x2(), i64x2::splat(!0)),
            mask.as_i64x2(),
        ));
        (0i64 == r) as i32
    }
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
    unsafe { ptestnzc(a.as_i64x2(), mask.as_i64x2()) }
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testz_si128(a, mask)
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Argument:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
///
/// Returns:
///
/// * `1` - if the bits specified in the operand are all set to 1,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_test_all_ones(a: __m128i) -> i32 {
    _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_mix_ones_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testnzc_si128(a, mask)
}

/// Loads 128 bits of integer data from memory into the returned vector.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated. To minimize caching, the data is flagged as
/// non-temporal (unlikely to be used again soon).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(movntdqa))]
#[stable(feature = "simd_x86_updates", since = "1.82.0")]
pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i {
    let dst: __m128i;
    crate::arch::asm!(
        vpl!("movntdqa {a}"),
        a = out(xmm_reg) dst,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack, preserves_flags),
    );
    dst
}

#[allow(improper_ctypes)]
unsafe extern "C" {
    #[link_name = "llvm.x86.sse41.insertps"]
    fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.packusdw"]
    fn packusdw(a: i32x4, b: i32x4) -> u16x8;
    #[link_name = "llvm.x86.sse41.dppd"]
    fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
    #[link_name = "llvm.x86.sse41.dpps"]
    fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.round.pd"]
    fn roundpd(a: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ps"]
    fn roundps(a: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.round.sd"]
    fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ss"]
    fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.phminposuw"]
    fn phminposuw(a: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.mpsadbw"]
    fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
    #[link_name = "llvm.x86.sse41.ptestnzc"]
    fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
}

1181#[cfg(test)]
1182mod tests {
1183    use crate::core_arch::x86::*;
1184    use std::mem;
1185    use stdarch_test::simd_test;
1186
1187    #[simd_test(enable = "sse4.1")]
1188    unsafe fn test_mm_blendv_epi8() {
1189        #[rustfmt::skip]
1190        let a = _mm_setr_epi8(
1191            0, 1, 2, 3, 4, 5, 6, 7,
1192            8, 9, 10, 11, 12, 13, 14, 15,
1193        );
1194        #[rustfmt::skip]
1195        let b = _mm_setr_epi8(
1196            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
1197        );
1198        #[rustfmt::skip]
1199        let mask = _mm_setr_epi8(
1200            0, -1, 0, -1, 0, -1, 0, -1,
1201            0, -1, 0, -1, 0, -1, 0, -1,
1202        );
1203        #[rustfmt::skip]
1204        let e = _mm_setr_epi8(
1205            0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
1206        );
1207        assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e);
1208    }
1209
1210    #[simd_test(enable = "sse4.1")]
1211    unsafe fn test_mm_blendv_pd() {
1212        let a = _mm_set1_pd(0.0);
1213        let b = _mm_set1_pd(1.0);
1214        let mask = transmute(_mm_setr_epi64x(0, -1));
1215        let r = _mm_blendv_pd(a, b, mask);
1216        let e = _mm_setr_pd(0.0, 1.0);
1217        assert_eq_m128d(r, e);
1218    }
1219
1220    #[simd_test(enable = "sse4.1")]
1221    unsafe fn test_mm_blendv_ps() {
1222        let a = _mm_set1_ps(0.0);
1223        let b = _mm_set1_ps(1.0);
1224        let mask = transmute(_mm_setr_epi32(0, -1, 0, -1));
1225        let r = _mm_blendv_ps(a, b, mask);
1226        let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1227        assert_eq_m128(r, e);
1228    }
1229
1230    #[simd_test(enable = "sse4.1")]
1231    unsafe fn test_mm_blend_pd() {
1232        let a = _mm_set1_pd(0.0);
1233        let b = _mm_set1_pd(1.0);
1234        let r = _mm_blend_pd::<0b10>(a, b);
1235        let e = _mm_setr_pd(0.0, 1.0);
1236        assert_eq_m128d(r, e);
1237    }
1238
1239    #[simd_test(enable = "sse4.1")]
1240    unsafe fn test_mm_blend_ps() {
1241        let a = _mm_set1_ps(0.0);
1242        let b = _mm_set1_ps(1.0);
1243        let r = _mm_blend_ps::<0b1010>(a, b);
1244        let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1245        assert_eq_m128(r, e);
1246    }
1247
1248    #[simd_test(enable = "sse4.1")]
1249    unsafe fn test_mm_blend_epi16() {
1250        let a = _mm_set1_epi16(0);
1251        let b = _mm_set1_epi16(1);
1252        let r = _mm_blend_epi16::<0b1010_1100>(a, b);
1253        let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1);
1254        assert_eq_m128i(r, e);
1255    }
1256
1257    #[simd_test(enable = "sse4.1")]
1258    unsafe fn test_mm_extract_ps() {
1259        let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
1260        let r: f32 = f32::from_bits(_mm_extract_ps::<1>(a) as u32);
1261        assert_eq!(r, 1.0);
1262        let r: f32 = f32::from_bits(_mm_extract_ps::<3>(a) as u32);
1263        assert_eq!(r, 3.0);
1264    }
1265
1266    #[simd_test(enable = "sse4.1")]
1267    unsafe fn test_mm_extract_epi8() {
1268        #[rustfmt::skip]
1269        let a = _mm_setr_epi8(
1270            -1, 1, 2, 3, 4, 5, 6, 7,
1271            8, 9, 10, 11, 12, 13, 14, 15
1272        );
1273        let r1 = _mm_extract_epi8::<0>(a);
1274        let r2 = _mm_extract_epi8::<3>(a);
1275        assert_eq!(r1, 0xFF);
1276        assert_eq!(r2, 3);
1277    }
1278
1279    #[simd_test(enable = "sse4.1")]
1280    unsafe fn test_mm_extract_epi32() {
1281        let a = _mm_setr_epi32(0, 1, 2, 3);
1282        let r = _mm_extract_epi32::<1>(a);
1283        assert_eq!(r, 1);
1284        let r = _mm_extract_epi32::<3>(a);
1285        assert_eq!(r, 3);
1286    }
1287
1288    #[simd_test(enable = "sse4.1")]
1289    unsafe fn test_mm_insert_ps() {
1290        let a = _mm_set1_ps(1.0);
1291        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1292        let r = _mm_insert_ps::<0b11_00_1100>(a, b);
1293        let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0);
1294        assert_eq_m128(r, e);
1295
1296        // Zeroing takes precedence over copied value
1297        let a = _mm_set1_ps(1.0);
1298        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1299        let r = _mm_insert_ps::<0b11_00_0001>(a, b);
1300        let e = _mm_setr_ps(0.0, 1.0, 1.0, 1.0);
1301        assert_eq_m128(r, e);
1302    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_insert_epi8() {
        let a = _mm_set1_epi8(0);
        let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm_insert_epi8::<1>(a, 32);
        assert_eq_m128i(r, e);
        let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0);
        let r = _mm_insert_epi8::<14>(a, 32);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_insert_epi32() {
        let a = _mm_set1_epi32(0);
        let e = _mm_setr_epi32(0, 32, 0, 0);
        let r = _mm_insert_epi32::<1>(a, 32);
        assert_eq_m128i(r, e);
        let e = _mm_setr_epi32(0, 0, 0, 32);
        let r = _mm_insert_epi32::<3>(a, 32);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 4, 5, 8, 9, 12, 13, 16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, 3, 6, 7, 10, 11, 14, 15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_max_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            2, 4, 6, 8, 10, 12, 14, 16,
            18, 20, 22, 24, 26, 28, 30, 32,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epu16() {
        let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
        let r = _mm_max_epu16(a, b);
        let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epi32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_max_epi32(a, b);
        let e = _mm_setr_epi32(2, 4, 6, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epu32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_max_epu32(a, b);
        let e = _mm_setr_epi32(2, 4, 6, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 4, 5, 8, 9, 12, 13, 16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, 3, 6, 7, 10, 11, 14, 15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_min_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            1, 3, 5, 7, 9, 11, 13, 15,
            17, 19, 21, 23, 25, 27, 29, 31,
        );
        assert_eq_m128i(r, e);

        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, -4, -5, 8, -9, -12, 13, -16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, -3, -6, 7, -10, -11, 14, -15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_min_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            1, -4, -6, 7, -10, -12, 13, -16,
            17, 19, 21, 23, 25, 27, 29, 31,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epu16() {
        let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
        let r = _mm_min_epu16(a, b);
        let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_min_epi32(a, b);
        let e = _mm_setr_epi32(1, 3, 5, 7);
        assert_eq_m128i(r, e);

        let a = _mm_setr_epi32(-1, 4, 5, -7);
        let b = _mm_setr_epi32(-2, 3, -6, 8);
        let r = _mm_min_epi32(a, b);
        let e = _mm_setr_epi32(-2, 3, -6, -7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epu32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_min_epu32(a, b);
        let e = _mm_setr_epi32(1, 3, 5, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_packus_epi32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let b = _mm_setr_epi32(-1, -2, -3, -4);
        let r = _mm_packus_epi32(a, b);
        let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }
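
    // PACKUSDW narrows with unsigned saturation, so the negative lanes of
    // `b` clamp to 0 in the upper half of the result.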

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cmpeq_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(0, 0);
        let r = _mm_cmpeq_epi64(a, b);
        let e = _mm_setr_epi64x(-1, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi32(-10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepu32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }
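
    // The `cvtepu` tests above use only positive inputs because these
    // conversions zero-extend rather than sign-extend: with -10 (0xF6 per
    // byte), `_mm_cvtepu8_epi16` would yield 246 in every lane, unlike the
    // sign-extending `cvtepi` conversions exercised earlier.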

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_pd() {
        let a = _mm_setr_pd(2.0, 3.0);
        let b = _mm_setr_pd(1.0, 4.0);
        let e = _mm_setr_pd(14.0, 0.0);
        assert_eq_m128d(_mm_dp_pd::<0b00110001>(a, b), e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_ps() {
        let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
        let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
        let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0);
        assert_eq_m128(_mm_dp_ps::<0b01110101>(a, b), e);
    }
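
    // Worked example for the DPPS immediate above: the high nibble 0b0111
    // multiplies lanes 0..=2 (2.0 * 1.0 + 3.0 * 4.0 + 1.0 * 0.5 = 14.5), and
    // the low nibble 0b0101 broadcasts the sum into lanes 0 and 2, zeroing
    // the rest. The DPPD immediate follows the same scheme over two lanes.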

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_pd() {
        let a = _mm_setr_pd(2.5, 4.5);
        let r = _mm_floor_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_ps() {
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let r = _mm_floor_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_sd() {
        let a = _mm_setr_pd(2.5, 4.5);
        let b = _mm_setr_pd(-1.5, -3.5);
        let r = _mm_floor_sd(a, b);
        let e = _mm_setr_pd(-2.0, 4.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_ss() {
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5);
        let r = _mm_floor_ss(a, b);
        let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5);
        assert_eq_m128(r, e);
    }
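
    // The `_sd`/`_ss` variants round only the lowest lane: lane 0 of the
    // result is floor(b[0]) and the remaining lanes are copied from `a`.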

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_pd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let r = _mm_ceil_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_ps() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let r = _mm_ceil_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_sd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_ceil_sd(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_ss() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5);
        let r = _mm_ceil_ss(a, b);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_pd() {
        let a = _mm_setr_pd(1.25, 3.75);
        let r = _mm_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a);
        let e = _mm_setr_pd(1.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_ps() {
        let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25);
        let r = _mm_round_ps::<_MM_FROUND_TO_ZERO>(a);
        let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_sd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_round_sd::<_MM_FROUND_TO_NEAREST_INT>(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);

        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_round_sd::<_MM_FROUND_TO_NEG_INF>(a, b);
        let e = _mm_setr_pd(-3.0, 3.5);
        assert_eq_m128d(r, e);

        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_round_sd::<_MM_FROUND_TO_POS_INF>(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);

        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_round_sd::<_MM_FROUND_TO_ZERO>(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }
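
    // Note on the nearest-int case above: SSE4.1 rounding breaks ties to
    // even, so -2.5 rounds to -2.0. _MM_FROUND_TO_NEG_INF yields -3.0, while
    // _MM_FROUND_TO_POS_INF and _MM_FROUND_TO_ZERO both yield -2.0.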

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_ss() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let r = _mm_round_ss::<_MM_FROUND_TO_NEAREST_INT>(a, b);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);

        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let r = _mm_round_ss::<_MM_FROUND_TO_NEG_INF>(a, b);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);

        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let r = _mm_round_ss::<_MM_FROUND_TO_POS_INF>(a, b);
        let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);

        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let r = _mm_round_ss::<_MM_FROUND_TO_ZERO>(a, b);
        let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_1() {
        let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_2() {
        let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_3() {
        // Case where the minimum value is repeated
        let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 13);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }
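
    // On ties PHMINPOSUW reports the lowest index: 13 appears in lanes 5 and
    // 7 above, and lane 1 of the result holds the index 5.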

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mul_epi32() {
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(1, 3);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */);
            let b = _mm_setr_epi32(-20, -256 /* ignored */, 666666, 666666 /* ignored */);
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(-300, 823043843622);
            assert_eq_m128i(r, e);
        }
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mullo_epi32() {
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mullo_epi32(a, b);
            let e = _mm_setr_epi32(1, 2, 3, 4);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, -2, 1234567, 99999);
            let b = _mm_setr_epi32(-20, -256, 666666, -99999);
            let r = _mm_mullo_epi32(a, b);
            // The full product 1234567 * 666666 overflows 32 bits; its
            // truncated low half has the most significant bit set, so it
            // reads back as the signed value -1589877210 in r[2].
            let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409);
            assert_eq_m128i(r, e);
        }
    }
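
    // A scalar model of the low-half multiply exercised above (a minimal
    // sketch; `mullo_ref` is an illustrative helper, not part of this
    // module's API): widening to 64 bits and truncating back matches the
    // SIMD lanes, e.g. mullo_ref(1234567, 666666) == -1589877210.
    #[allow(dead_code)]
    fn mullo_ref(x: i32, y: i32) -> i32 {
        (i64::from(x) * i64::from(y)) as i32
    }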

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16() {
        let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mpsadbw_epu8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );

        let r = _mm_mpsadbw_epu8::<0b000>(a, a);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b001>(a, a);
        let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b100>(a, a);
        let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b101>(a, a);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b111>(a, a);
        let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4);
        assert_eq_m128i(r, e);
    }
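
    // A scalar reference model for the MPSADBW cases above (a hedged sketch;
    // `mpsadbw_ref` is an illustrative helper, not part of this module's
    // API). IMM8 bit 2 selects a 0- or 4-byte offset into `a`, bits 1:0
    // select a 4-byte block of `b`, and each output lane is the sum of
    // absolute differences between that block and a sliding 4-byte window
    // of `a`.
    #[allow(dead_code)]
    fn mpsadbw_ref(a: [u8; 16], b: [u8; 16], imm8: u8) -> [u16; 8] {
        let a_off = 4 * ((imm8 >> 2) & 1) as usize;
        let b_off = 4 * (imm8 & 3) as usize;
        let mut r = [0u16; 8];
        for i in 0..8 {
            for j in 0..4 {
                r[i] += (i16::from(a[a_off + i + j]) - i16::from(b[b_off + j])).unsigned_abs();
            }
        }
        r
    }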

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testz_si128() {
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testc_si128() {
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testnzc_si128() {
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
    }
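
    // Flag semantics shared by the three PTEST tests above: ZF is set when
    // (a AND mask) == 0 and CF is set when ((NOT a) AND mask) == 0. testz
    // returns ZF, testc returns CF, and testnzc returns 1 only when both
    // flags are clear, i.e. the masked bits of `a` mix ones and zeros.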

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_zeros() {
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_ones() {
        let a = _mm_set1_epi8(-1);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_mix_ones_zeros() {
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
    }
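
    // Per Intel's definitions, these helpers map onto PTEST conditions:
    // `_mm_test_all_zeros(a, mask)` corresponds to `_mm_testz_si128`,
    // `_mm_test_all_ones(a)` to `_mm_testc_si128` against an all-ones
    // vector, and `_mm_test_mix_ones_zeros` to `_mm_testnzc_si128`.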

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_stream_load_si128() {
        let a = _mm_set_epi64x(5, 6);
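        // MOVNTDQA requires a 16-byte-aligned source; `__m128i`'s alignment
        // guarantees that `addr_of!(a)` meets this requirement.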
        let r = _mm_stream_load_si128(core::ptr::addr_of!(a) as *const _);
        assert_eq_m128i(a, r);
    }
}