core/stdarch/crates/core_arch/src/x86/sse.rs

//! Streaming SIMD Extensions (SSE)

use crate::{
    core_arch::{simd::*, x86::*},
    intrinsics::simd::*,
    intrinsics::sqrtf32,
    mem, ptr,
};

#[cfg(test)]
use stdarch_test::assert_instr;

/// Adds the first components of `a` and `b`; the other components are copied
/// from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss)
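/// # Examples
///
/// A minimal usage sketch (illustrative only; assumes `std` is available for
/// runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("sse") {
///     // SAFETY: `sse` was detected at runtime.
///     unsafe {
///         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
///         let b = _mm_setr_ps(10.0, 20.0, 30.0, 40.0);
///         let r = _mm_add_ss(a, b);
///         // Lane 0 is `1.0 + 10.0`; the other lanes are copied from `a`.
///         assert_eq!(_mm_cvtss_f32(r), 11.0);
///     }
/// }
/// ```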
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(addss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) + _mm_cvtss_f32(b)) }
}

/// Adds packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(addps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_add_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_add(a, b) }
}

/// Subtracts the first component of `b` from `a`; the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(subss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) - _mm_cvtss_f32(b)) }
}

/// Subtracts packed single-precision (32-bit) floating-point elements in `b`
/// from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(subps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_sub(a, b) }
}

/// Multiplies the first components of `a` and `b`; the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(mulss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) * _mm_cvtss_f32(b)) }
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(mulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_mul(a, b) }
}

/// Divides the first component of `a` by the first component of `b`; the
/// other components are copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(divss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_div_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) / _mm_cvtss_f32(b)) }
}

/// Divides packed single-precision (32-bit) floating-point elements in `a` by
/// the corresponding elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(divps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_div_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_div(a, b) }
}

/// Returns the square root of the first single-precision (32-bit)
/// floating-point element in `a`; the other elements are unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sqrt_ss(a: __m128) -> __m128 {
    unsafe { simd_insert!(a, 0, sqrtf32(_mm_cvtss_f32(a))) }
}

/// Returns the square root of packed single-precision (32-bit) floating-point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps)
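/// # Examples
///
/// A minimal sketch (illustrative only; assumes `std` is available for
/// runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("sse") {
///     // SAFETY: `sse` was detected at runtime.
///     unsafe {
///         let a = _mm_setr_ps(1.0, 4.0, 9.0, 16.0);
///         let r = _mm_sqrt_ps(a);
///         let mut out = [0.0f32; 4];
///         _mm_storeu_ps(out.as_mut_ptr(), r);
///         assert_eq!(out, [1.0, 2.0, 3.0, 4.0]);
///     }
/// }
/// ```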
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sqrt_ps(a: __m128) -> __m128 {
    unsafe { simd_fsqrt(a) }
}

/// Returns the approximate reciprocal of the first single-precision
/// (32-bit) floating-point element in `a`; the other elements are unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_rcp_ss(a: __m128) -> __m128 {
    unsafe { rcpss(a) }
}

/// Returns the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_rcp_ps(a: __m128) -> __m128 {
    unsafe { rcpps(a) }
}

/// Returns the approximate reciprocal square root of the first single-precision
/// (32-bit) floating-point element in `a`; the other elements are unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_rsqrt_ss(a: __m128) -> __m128 {
    unsafe { rsqrtss(a) }
}

/// Returns the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_rsqrt_ps(a: __m128) -> __m128 {
    unsafe { rsqrtps(a) }
}

/// Compares the first single-precision (32-bit) floating-point elements of `a`
/// and `b`, and returns the minimum value in the first element of the return
/// value; the other elements are copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { minss(a, b) }
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns the corresponding minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps)
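/// # Examples
///
/// A minimal sketch (illustrative only; assumes `std` is available for
/// runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("sse") {
///     // SAFETY: `sse` was detected at runtime.
///     unsafe {
///         let a = _mm_setr_ps(1.0, 5.0, 3.0, 8.0);
///         let b = _mm_setr_ps(2.0, 4.0, 3.0, 7.0);
///         let r = _mm_min_ps(a, b);
///         let mut out = [0.0f32; 4];
///         _mm_storeu_ps(out.as_mut_ptr(), r);
///         // Each lane holds the smaller of the two inputs.
///         assert_eq!(out, [1.0, 4.0, 3.0, 7.0]);
///     }
/// }
/// ```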
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_min_ps(a: __m128, b: __m128) -> __m128 {
    // See the `test_mm_min_ps` test for why this can't be implemented using `simd_fmin`.
    unsafe { minps(a, b) }
}

/// Compares the first single-precision (32-bit) floating-point elements of `a`
/// and `b`, and returns the maximum value in the first element of the return
/// value; the other elements are copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { maxss(a, b) }
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns the corresponding maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
    // See the `test_mm_min_ps` test for why this can't be implemented using `simd_fmax`.
    unsafe { maxps(a, b) }
}

/// Bitwise AND of packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `and` instructions, so we ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(andps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
    unsafe {
        let a: __m128i = mem::transmute(a);
        let b: __m128i = mem::transmute(b);
        mem::transmute(simd_and(a, b))
    }
}

/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point
/// elements.
///
/// Computes `!a & b` for each bit in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps)
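/// # Examples
///
/// A common use is clearing the sign bit of every lane (a bitwise absolute
/// value), using a `-0.0` splat as the sign mask. A minimal sketch
/// (illustrative only; assumes `std` for runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("sse") {
///     // SAFETY: `sse` was detected at runtime.
///     unsafe {
///         let sign_mask = _mm_set1_ps(-0.0); // only the sign bit set
///         let a = _mm_setr_ps(-1.0, 2.0, -3.0, 4.0);
///         // `!sign_mask & a` clears each lane's sign bit.
///         let r = _mm_andnot_ps(sign_mask, a);
///         let mut out = [0.0f32; 4];
///         _mm_storeu_ps(out.as_mut_ptr(), r);
///         assert_eq!(out, [1.0, 2.0, 3.0, 4.0]);
///     }
/// }
/// ```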
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `not` and `and` instructions, so we
// ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(andnps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
    unsafe {
        let a: __m128i = mem::transmute(a);
        let b: __m128i = mem::transmute(b);
        let mask: __m128i = mem::transmute(i32x4::splat(-1));
        mem::transmute(simd_and(simd_xor(mask, a), b))
    }
}

/// Bitwise OR of packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `or` instructions, so we ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(orps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
    unsafe {
        let a: __m128i = mem::transmute(a);
        let b: __m128i = mem::transmute(b);
        mem::transmute(simd_or(a, b))
    }
}

/// Bitwise exclusive OR of packed single-precision (32-bit) floating-point
/// elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `xor` instructions, so we ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(xorps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
    unsafe {
        let a: __m128i = mem::transmute(a);
        let b: __m128i = mem::transmute(b);
        mem::transmute(simd_xor(a, b))
    }
}

/// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of
/// the result will be `0xffffffff` if the two inputs are equal, or `0`
/// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss)
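/// # Examples
///
/// A minimal sketch (illustrative only; assumes `std` is available for
/// runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("sse") {
///     // SAFETY: `sse` was detected at runtime.
///     unsafe {
///         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
///         let b = _mm_setr_ps(1.0, 9.0, 9.0, 9.0);
///         let r = _mm_cmpeq_ss(a, b);
///         // Bit 0 of the movemask reflects the lane-0 comparison result.
///         assert_eq!(_mm_movemask_ps(r) & 1, 1);
///         let mut out = [0.0f32; 4];
///         _mm_storeu_ps(out.as_mut_ptr(), r);
///         // The upper lanes are copied from `a` unchanged.
///         assert_eq!(out[1..], [2.0, 3.0, 4.0]);
///     }
/// }
/// ```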
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpeqss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpss(a, b, 0) }
}

/// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits
/// of the result will be `0xffffffff` if `a.extract(0)` is less than
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpss(a, b, 1) }
}

/// Compares the lowest `f32` of both inputs for less than or equal. The lowest
/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than
/// or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits of the
/// result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpss(a, b, 2) }
}

/// Compares the lowest `f32` of both inputs for greater than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is greater
/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_shuffle!(a, cmpss(b, a, 1), [4, 1, 2, 3]) }
}

/// Compares the lowest `f32` of both inputs for greater than or equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is
/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96
/// bits of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_shuffle!(a, cmpss(b, a, 2), [4, 1, 2, 3]) }
}

/// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits
/// of the result will be `0xffffffff` if `a.extract(0)` is not equal to
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpneqss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpss(a, b, 4) }
}

/// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpss(a, b, 5) }
}

/// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits
/// of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpss(a, b, 6) }
}

/// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater
/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are
/// the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_shuffle!(a, cmpss(b, a, 5), [4, 1, 2, 3]) }
}

/// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96
/// bits of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_shuffle!(a, cmpss(b, a, 6), [4, 1, 2, 3]) }
}

/// Checks if the lowest `f32` values of both inputs are ordered. The lowest 32
/// bits of the result will be `0xffffffff` if neither of `a.extract(0)` or
/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpordss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpss(a, b, 7) }
}

/// Checks if the lowest `f32` values of both inputs are unordered. The lowest
/// 32 bits of the result will be `0xffffffff` if either of `a.extract(0)` or
/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpunordss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpss(a, b, 3) }
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input elements
/// were equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps)
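/// # Examples
///
/// A minimal sketch (illustrative only; assumes `std` is available for
/// runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("sse") {
///     // SAFETY: `sse` was detected at runtime.
///     unsafe {
///         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
///         let b = _mm_setr_ps(1.0, 9.0, 3.0, 9.0);
///         let r = _mm_cmpeq_ps(a, b);
///         // Collect the per-lane results as a 4-bit mask: lanes 0 and 2 matched.
///         assert_eq!(_mm_movemask_ps(r), 0b0101);
///     }
/// }
/// ```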
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpeqps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(a, b, 0) }
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is less than the corresponding element in `b`, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(a, b, 1) }
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is less than or equal to the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(a, b, 2) }
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is greater than the corresponding element in `b`, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(b, a, 1) }
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is greater than or equal to the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(b, a, 2) }
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input elements
/// are **not** equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpneqps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(a, b, 4) }
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** less than the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(a, b, 5) }
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** less than or equal to the corresponding element in `b`, or
/// `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(a, b, 6) }
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** greater than the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(b, a, 5) }
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** greater than or equal to the corresponding element in `b`,
/// or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(b, a, 6) }
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// Returns four floats that have one of two possible bit patterns. The element
/// in the output vector will be `0xffffffff` if the input elements in `a` and
/// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpordps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(b, a, 7) }
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// Returns four floats that have one of two possible bit patterns. The element
/// in the output vector will be `0xffffffff` if the input elements in `a` and
/// `b` are unordered (i.e., at least one of them is a NaN), or 0 otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpunordps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { cmpps(b, a, 3) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss)
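/// # Examples
///
/// A minimal sketch (illustrative only; assumes `std` is available for
/// runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("sse") {
///     // SAFETY: `sse` was detected at runtime.
///     unsafe {
///         let a = _mm_set_ss(1.0);
///         assert_eq!(_mm_comieq_ss(a, _mm_set_ss(1.0)), 1);
///         assert_eq!(_mm_comieq_ss(a, _mm_set_ss(2.0)), 0);
///     }
/// }
/// ```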
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 {
    unsafe { comieq_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 {
    unsafe { comilt_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comile_ss(a: __m128, b: __m128) -> i32 {
    unsafe { comile_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than the one from `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 {
    unsafe { comigt_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than or equal to the one from `b`, or
/// `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comige_ss(a: __m128, b: __m128) -> i32 {
    unsafe { comige_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are **not** equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 {
    unsafe { comineq_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are equal, or `0` otherwise. This instruction will not signal
/// an exception if either argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 {
    unsafe { ucomieq_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
/// This instruction will not signal an exception if either argument is a quiet
/// NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 {
    unsafe { ucomilt_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
/// otherwise. This instruction will not signal an exception if either argument
/// is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 {
    unsafe { ucomile_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than the one from `b`, or `0`
/// otherwise. This instruction will not signal an exception if either argument
/// is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 {
    unsafe { ucomigt_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than or equal to the one from `b`, or
/// `0` otherwise. This instruction will not signal an exception if either
/// argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 {
    unsafe { ucomige_ss(a, b) }
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are **not** equal, or `0` otherwise. This instruction will not
/// signal an exception if either argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 {
    unsafe { ucomineq_ss(a, b) }
}

/// Converts the lowest 32-bit float in the input vector to a 32-bit integer.
///
/// The result is rounded according to the current rounding mode. If the result
/// cannot be represented as a 32-bit integer the result will be `0x8000_0000`
/// (`i32::MIN`).
///
/// This corresponds to the `CVTSS2SI` instruction (with 32-bit output).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32)
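/// # Examples
///
/// An illustrative sketch; it assumes the default MXCSR rounding mode (round
/// to nearest, ties to even) and `std` for runtime feature detection:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("sse") {
///     // SAFETY: `sse` was detected at runtime.
///     unsafe {
///         // With round-to-nearest-even, the tie `2.5` rounds to `2`.
///         assert_eq!(_mm_cvtss_si32(_mm_set_ss(2.5)), 2);
///         assert_eq!(_mm_cvtss_si32(_mm_set_ss(-1.75)), -2);
///     }
/// }
/// ```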
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtss_si32(a: __m128) -> i32 {
    unsafe { cvtss2si(a) }
}

/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvt_ss2si(a: __m128) -> i32 {
    _mm_cvtss_si32(a)
}

/// Converts the lowest 32-bit float in the input vector to a 32-bit integer
/// with truncation.
///
/// The result is always rounded using truncation (round towards zero). If the
/// result cannot be represented as a 32-bit integer the result will be
/// `0x8000_0000` (`i32::MIN`).
///
/// This corresponds to the `CVTTSS2SI` instruction (with 32-bit output).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32)
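/// # Examples
///
/// A minimal sketch (illustrative only; assumes `std` is available for
/// runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("sse") {
///     // SAFETY: `sse` was detected at runtime.
///     unsafe {
///         // Truncation always rounds toward zero, regardless of sign.
///         assert_eq!(_mm_cvttss_si32(_mm_set_ss(2.9)), 2);
///         assert_eq!(_mm_cvttss_si32(_mm_set_ss(-2.9)), -2);
///     }
/// }
/// ```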
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvttss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvttss_si32(a: __m128) -> i32 {
    unsafe { cvttss2si(a) }
}

/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvttss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtt_ss2si(a: __m128) -> i32 {
    _mm_cvttss_si32(a)
}

/// Extracts the lowest 32-bit float from the input vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32)
#[inline]
#[target_feature(enable = "sse")]
// No point in using assert_instr. In the Unix x86_64 calling convention this
// is a no-op, and on msvc it's just a `mov`.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtss_f32(a: __m128) -> f32 {
    unsafe { simd_extract!(a, 0) }
}

/// Converts a 32-bit integer to a 32-bit float. The result vector is the input
/// vector `a` with the lowest 32-bit float replaced by the converted integer.
///
/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32-bit
/// input).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
    unsafe { simd_insert!(a, 0, b as f32) }
}

/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 {
    _mm_cvtsi32_ss(a, b)
}

/// Constructs a `__m128` with the lowest element set to `a` and the rest set to
/// zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_ss(a: f32) -> __m128 {
    __m128([a, 0.0, 0.0, 0.0])
}

/// Constructs a `__m128` with all elements set to `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set1_ps(a: f32) -> __m128 {
    __m128([a, a, a, a])
}

/// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_ps1(a: f32) -> __m128 {
    _mm_set1_ps(a)
}

/// Constructs a `__m128` from four floating-point values, highest to lowest.
///
/// Note that `a` will be the highest 32 bits of the result, and `d` the
/// lowest. This matches the standard way of writing bit patterns on x86:
///
/// ```text
///  bit    127 .. 96  95 .. 64  63 .. 32  31 .. 0
///        +---------+---------+---------+---------+
///        |    a    |    b    |    c    |    d    |   result
///        +---------+---------+---------+---------+
/// ```
///
/// Alternatively:
///
/// ```text
/// let v = _mm_set_ps(d, c, b, a);
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps)
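/// # Examples
///
/// A minimal sketch of the argument order (illustrative only; assumes `std`
/// for runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("sse") {
///     // SAFETY: `sse` was detected at runtime.
///     unsafe {
///         let v = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
///         // `d` (here `1.0`) is the lowest element...
///         assert_eq!(_mm_cvtss_f32(v), 1.0);
///         // ...and in memory the order is `[d, c, b, a]`.
///         let mut out = [0.0f32; 4];
///         _mm_storeu_ps(out.as_mut_ptr(), v);
///         assert_eq!(out, [1.0, 2.0, 3.0, 4.0]);
///     }
/// }
/// ```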
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
    __m128([d, c, b, a])
}

/// Constructs a `__m128` from four floating-point values, lowest to highest.
///
/// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32
/// bits of the result, and `d` the highest.
///
/// ```text
/// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d));
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(
    all(test, any(target_env = "msvc", target_arch = "x86_64")),
    assert_instr(unpcklps)
)]
// On 32-bit non-msvc targets it just copies the operands from the stack.
#[cfg_attr(
    all(test, all(not(target_env = "msvc"), target_arch = "x86")),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
    __m128([a, b, c, d])
}

/// Constructs a `__m128` with all elements initialized to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_setzero_ps() -> __m128 {
    const { unsafe { mem::zeroed() } }
}

/// A utility function for creating masks to use with Intel shuffle and
/// permute intrinsics.
#[inline]
#[allow(non_snake_case)]
#[unstable(feature = "stdarch_x86_mm_shuffle", issue = "111147")]
pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
    ((z << 6) | (y << 4) | (x << 2) | w) as i32
}

/// Shuffles packed single-precision (32-bit) floating-point elements in `a`
/// and `b` using `MASK`.
///
/// The lower half of the result takes values from `a` and the higher half
/// from `b`. The mask is split into four 2-bit fields, each of which indexes
/// an element in the corresponding input.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps)
///
/// Note that there appears to be a mistake within Intel's Intrinsics Guide.
/// `_mm_shuffle_ps` is supposed to take an `i32` instead of a `u32`
/// as is the case for [other shuffle intrinsics](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_).
/// Performing an implicit type conversion between an unsigned integer and a
/// signed integer does not cause a problem in C; however, Rust's commitment
/// to strong typing does not allow this.
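/// # Examples
///
/// A sketch of how the mask bits select lanes. A literal mask is used here
/// because `_MM_SHUFFLE` is still unstable; assumes `std` for runtime feature
/// detection:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("sse") {
///     // SAFETY: `sse` was detected at runtime.
///     unsafe {
///         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
///         let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
///         // Reading low to high, the 2-bit fields pick a[3], a[2], b[1], b[0].
///         let r = _mm_shuffle_ps::<0b00_01_10_11>(a, b);
///         let mut out = [0.0f32; 4];
///         _mm_storeu_ps(out.as_mut_ptr(), r);
///         assert_eq!(out, [4.0, 3.0, 6.0, 5.0]);
///     }
/// }
/// ```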
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(MASK, 8);
    unsafe {
        simd_shuffle!(
            a,
            b,
            [
                MASK as u32 & 0b11,
                (MASK as u32 >> 2) & 0b11,
                ((MASK as u32 >> 4) & 0b11) + 4,
                ((MASK as u32 >> 6) & 0b11) + 4,
            ],
        )
    }
}

/// Unpacks and interleaves single-precision (32-bit) floating-point elements
/// from the higher half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_shuffle!(a, b, [2, 6, 3, 7]) }
}

/// Unpacks and interleaves single-precision (32-bit) floating-point elements
/// from the lower half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_shuffle!(a, b, [0, 4, 1, 5]) }
}

/// Combines the higher halves of `a` and `b`. The higher half of `b` occupies
/// the lower half of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movhlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
    // TODO: figure out why this is a different instruction on msvc.
    unsafe { simd_shuffle!(a, b, [6, 7, 2, 3]) }
}

/// Combines the lower halves of `a` and `b`. The lower half of `b` occupies
/// the higher half of the result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
    unsafe { simd_shuffle!(a, b, [0, 1, 4, 5]) }
}

/// Returns a mask of the most significant bit of each element in `a`.
///
/// The mask is stored in the 4 least significant bits of the return value.
/// All other bits are set to `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps)
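/// # Examples
///
/// A minimal sketch (illustrative only; assumes `std` is available for
/// runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("sse") {
///     // SAFETY: `sse` was detected at runtime.
///     unsafe {
///         let a = _mm_setr_ps(-1.0, 2.0, -3.0, 4.0);
///         // The sign bits of lanes 0 and 2 are set.
///         assert_eq!(_mm_movemask_ps(a), 0b0101);
///     }
/// }
/// ```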
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movmskps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_movemask_ps(a: __m128) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
    unsafe {
        let mask: i32x4 = simd_lt(transmute(a), i32x4::ZERO);
        simd_bitmask::<i32x4, u8>(mask).into()
    }
}

/// Constructs a `__m128` with the lowest element read from `p` and the other
/// elements set to zero.
///
/// This corresponds to instructions `VMOVSS` / `MOVSS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 {
    __m128([*p, 0.0, 0.0, 0.0])
}

/// Constructs a `__m128` by duplicating the value read from `p` into all
/// elements.
///
/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some
/// shuffling.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 {
    let a = *p;
    __m128([a, a, a, a])
}

/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 {
    _mm_load1_ps(p)
}

/// Loads four `f32` values from *aligned* memory into a `__m128`. If the
/// pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned
/// memory.
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps)
#[inline]
#[target_feature(enable = "sse")]
// FIXME: Rust doesn't emit alignment attributes for MSVC x86-32. Ref https://github.com/rust-lang/rust/pull/139261
// All aligned load/store intrinsics are affected.
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 {
    *(p as *const __m128)
}

/// Loads four `f32` values from memory into a `__m128`. There are no
/// restrictions on memory alignment. For aligned memory,
/// [`_mm_load_ps`](fn._mm_load_ps.html) may be faster.
///
/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps)
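/// # Examples
///
/// A minimal sketch (illustrative only; assumes `std` is available for
/// runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("sse") {
///     // SAFETY: `sse` was detected at runtime, and the source range of four
///     // `f32` values is valid for reads.
///     unsafe {
///         let data = [1.0f32, 2.0, 3.0, 4.0, 5.0];
///         // No alignment requirement: loading from an odd offset is fine.
///         let r = _mm_loadu_ps(data[1..].as_ptr());
///         assert_eq!(_mm_cvtss_f32(r), 2.0);
///     }
/// }
/// ```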
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 {
    // Note: Using `*p` would require `f32` alignment, but `movups` has no
    // alignment restrictions.
    let mut dst = _mm_undefined_ps();
    ptr::copy_nonoverlapping(
        p as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m128>(),
    );
    dst
}

/// Loads four `f32` values from aligned memory into a `__m128` in reverse
/// order.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// let a0 = *p;
/// let a1 = *p.add(1);
/// let a2 = *p.add(2);
/// let a3 = *p.add(3);
/// __m128::new(a3, a2, a1, a0)
/// ```
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some
/// shuffling.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
    let a = _mm_load_ps(p);
    simd_shuffle!(a, a, [3, 2, 1, 0])
}

/// Stores the lowest 32-bit float of `a` into memory.
///
/// This intrinsic corresponds to the `MOVSS` instruction.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) {
    *p = simd_extract!(a, 0);
}

/// Stores the lowest 32-bit float of `a` repeated four times into *aligned*
/// memory.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// let x = a.extract(0);
/// *p = x;
/// *p.add(1) = x;
/// *p.add(2) = x;
/// *p.add(3) = x;
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) {
    let b: __m128 = simd_shuffle!(a, a, [0, 0, 0, 0]);
    *(p as *mut __m128) = b;
}

/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) {
    _mm_store1_ps(p, a);
}

/// Stores four 32-bit floats into *aligned* memory.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned
/// memory.
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(
    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) {
    *(p as *mut __m128) = a;
}

/// Stores four 32-bit floats into memory. There are no restrictions on memory
/// alignment. For aligned memory, [`_mm_store_ps`](fn._mm_store_ps.html) may
/// be faster.
///
/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps)
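/// # Examples
///
/// A minimal round-trip sketch (illustrative only; assumes `std` is available
/// for runtime feature detection):
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// if is_x86_feature_detected!("sse") {
///     // SAFETY: `sse` was detected at runtime, and the destination has room
///     // for four `f32` values.
///     unsafe {
///         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
///         let mut out = [0.0f32; 4];
///         _mm_storeu_ps(out.as_mut_ptr(), a);
///         assert_eq!(out, [1.0, 2.0, 3.0, 4.0]);
///     }
/// }
/// ```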
1319#[inline]
1320#[target_feature(enable = "sse")]
1321#[cfg_attr(test, assert_instr(movups))]
1322#[stable(feature = "simd_x86", since = "1.27.0")]
1323pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) {
1324    ptr::copy_nonoverlapping(
1325        ptr::addr_of!(a) as *const u8,
1326        p as *mut u8,
1327        mem::size_of::<__m128>(),
1328    );
1329}
1330
1331/// Stores four 32-bit floats into *aligned* memory in reverse order.
1332///
1333/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1334/// protection fault will be triggered (fatal program crash).
1335///
1336/// Functionally equivalent to the following code sequence (assuming `p`
1337/// satisfies the alignment restrictions):
1338///
1339/// ```text
1340/// *p = a.extract(3);
1341/// *p.add(1) = a.extract(2);
1342/// *p.add(2) = a.extract(1);
1343/// *p.add(3) = a.extract(0);
1344/// ```
1345///
1346/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps)
1347#[inline]
1348#[target_feature(enable = "sse")]
1349#[cfg_attr(
1350    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1351    assert_instr(movaps)
1352)]
1353#[stable(feature = "simd_x86", since = "1.27.0")]
1354#[allow(clippy::cast_ptr_alignment)]
1355pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
1356    let b: __m128 = simd_shuffle!(a, a, [3, 2, 1, 0]);
1357    *(p as *mut __m128) = b;
1358}
1359
1360/// Returns a `__m128` with the first component from `b` and the remaining
1361/// components from `a`.
1362///
1363/// In other words for any `a` and `b`:
1364/// ```text
1365/// _mm_move_ss(a, b) == a.replace(0, b.extract(0))
1366/// ```
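///
/// For example:
///
/// ```rust,ignore
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
/// let r = _mm_move_ss(a, b); // r contains [5.0, 2.0, 3.0, 4.0]
/// ```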
1367///
1368/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss)
1369#[inline]
1370#[target_feature(enable = "sse")]
1371#[cfg_attr(test, assert_instr(movss))]
1372#[stable(feature = "simd_x86", since = "1.27.0")]
1373pub fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
1374    unsafe { simd_shuffle!(a, b, [4, 1, 2, 3]) }
1375}
1376
1377/// Performs a serializing operation on all non-temporal ("streaming") store instructions that
1378/// were issued by the current thread prior to this instruction.
1379///
1380/// Guarantees that every non-temporal store instruction that precedes this fence, in program order, is
1381/// ordered before any load or store instruction which follows the fence in
1382/// program order.
1383///
1384/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence)
1385/// (but note that Intel is only documenting the hardware-level concerns related to this
1386/// instruction; the Intel documentation does not take into account the extra concerns that arise
1387/// because the Rust memory model is different from the x86 memory model.)
1388///
1389/// # Safety of non-temporal stores
1390///
1391/// After using any non-temporal store intrinsic, but before any other access to the memory that the
1392/// intrinsic mutates, a call to `_mm_sfence` must be performed by the thread that used the
1393/// intrinsic.
1394///
1395/// Non-temporal stores behave very differently from regular stores. For the purpose of the Rust
1396/// memory model, these stores are happening asynchronously in a background thread. This means a
1397/// non-temporal store can cause data races with other accesses, even other accesses on the same
1398/// thread. It also means that cross-thread synchronization does not work as expected: let's say the
1399/// intrinsic is called on thread T1, and T1 performs synchronization with some other thread T2. The
1400/// non-temporal store acts as if it happened not in T1 but in a different thread T3, and T2 has not
1401/// synchronized with T3! Calling `_mm_sfence` makes the current thread wait for and synchronize
1402/// with all the non-temporal stores previously started on this thread, which means in particular
1403/// that subsequent synchronization with other threads will then work as intended again.
1404///
1405/// The general pattern to use non-temporal stores correctly is to call `_mm_sfence` before your
1406/// code jumps back to code outside your library. This ensures all stores inside your function
1407/// are synchronized-before the return, and thus transitively synchronized-before everything
1408/// the caller does after your function returns.
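///
/// A minimal sketch of that pattern (`fill_streaming` is a hypothetical helper;
/// `dst` is assumed valid, 16-byte aligned, and large enough for `4 * n` floats):
///
/// ```rust,ignore
/// unsafe fn fill_streaming(dst: *mut f32, v: __m128, n: usize) {
///     for i in 0..n {
///         _mm_stream_ps(dst.add(4 * i), v); // non-temporal stores
///     }
///     _mm_sfence(); // synchronize before control returns to the caller
/// }
/// ```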
1409//
1410// The following is not a doc comment since it's not clear whether we want to put this into the
1411// docs, but it should be written out somewhere.
1412//
1413// Formally, we consider non-temporal stores and sfences to be opaque blobs that the compiler cannot
1414// inspect, and that behave like the following functions. This explains where the docs above come
1415// from.
1416// ```
1417// #[thread_local]
1418// static mut PENDING_NONTEMP_WRITES: AtomicUsize = AtomicUsize::new(0);
1419//
1420// pub unsafe fn nontemporal_store<T>(ptr: *mut T, val: T) {
1421//     PENDING_NONTEMP_WRITES.fetch_add(1, Relaxed);
1422//     // Spawn a thread that will eventually do our write.
1423//     // We need to fetch a pointer to this thread's pending-write
1424//     // counter, so that we can access it from the background thread.
1425//     let pending_writes = addr_of!(PENDING_NONTEMP_WRITES);
1426//     // If this was actual Rust code we'd have to do some extra work
1427//     // because `ptr`, `val`, `pending_writes` are all `!Send`. We skip that here.
1428//     std::thread::spawn(move || {
1429//         // Do the write in the background thread.
1430//         ptr.write(val);
1431//         // Register the write as done. Crucially, this is `Release`, so it
1432//         // syncs-with the `Acquire` in `sfence`.
1433//         (&*pending_writes).fetch_sub(1, Release);
1434//     });
1435// }
1436//
1437// pub fn sfence() {
1438//     unsafe {
1439//         // Wait until there are no more pending writes.
1440//         while PENDING_NONTEMP_WRITES.load(Acquire) > 0 {}
1441//     }
1442// }
1443// ```
1444#[inline]
1445#[target_feature(enable = "sse")]
1446#[cfg_attr(test, assert_instr(sfence))]
1447#[stable(feature = "simd_x86", since = "1.27.0")]
1448pub fn _mm_sfence() {
1449    unsafe { sfence() }
1450}
1451
1452/// Gets the unsigned 32-bit value of the MXCSR control and status register.
1453///
1454/// Note that Rust makes no guarantees whatsoever about the contents of this register: Rust
1455/// floating-point operations may or may not result in this register getting updated with exception
1456/// state, and the register can change between two invocations of this function even when no
1457/// floating-point operations appear in the source code (since floating-point operations appearing
1458/// earlier or later can be reordered).
1459///
1460/// If you need to perform some floating-point operations and check whether they raised an
1461/// exception, use an inline assembly block for the entire sequence of operations.
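///
/// A minimal sketch of reading MXCSR with inline assembly instead (assumes an
/// x86 target and the default Intel syntax of `asm!`):
///
/// ```rust,ignore
/// let mut csr: u32 = 0;
/// unsafe {
///     core::arch::asm!(
///         "stmxcsr [{p}]", // store MXCSR through the pointer in `p`
///         p = in(reg) core::ptr::addr_of_mut!(csr),
///         options(nostack, preserves_flags),
///     );
/// }
/// ```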
1462///
1463/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
1464///
1465/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr)
1466#[inline]
1467#[target_feature(enable = "sse")]
1468#[cfg_attr(test, assert_instr(stmxcsr))]
1469#[stable(feature = "simd_x86", since = "1.27.0")]
1470#[deprecated(
1471    since = "1.75.0",
1472    note = "see `_mm_getcsr` documentation - use inline assembly instead"
1473)]
1474pub unsafe fn _mm_getcsr() -> u32 {
1475    unsafe {
1476        let mut result = 0_i32;
1477        stmxcsr(ptr::addr_of_mut!(result) as *mut i8);
1478        result as u32
1479    }
1480}
1481
1482/// Sets the MXCSR register with the 32-bit unsigned integer value.
1483///
1484/// This register controls how SIMD instructions handle floating point
1485/// operations. Modifying this register only affects the current thread.
1486///
1487/// It contains several groups of flags:
1488///
1489/// * *Exception flags* report which exceptions occurred since they were last reset.
1490///
1491/// * *Masking flags* can be used to mask (ignore) certain exceptions. By default
1492///   these flags are all set to 1, so all exceptions are masked. When
1493///   an exception is masked, the processor simply sets the exception flag and
1494///   continues the operation. If the exception is unmasked, the flag is also set
1495///   but additionally an exception handler is invoked.
1496///
1497/// * *Rounding mode flags* control the rounding mode of floating point
1498///   instructions.
1499///
1500/// * The *denormals-are-zero mode flag* turns all numbers which would be
1501///   denormalized (exponent bits are all zeros) into zeros.
1502///
1503/// Note that modifying the masking flags, rounding mode, or denormals-are-zero mode flags leads to
1504/// **immediate Undefined Behavior**: Rust assumes that these are always in their default state and
1505/// will optimize accordingly. This even applies when the register is altered and later reset to its
1506/// original value without any floating-point operations appearing in the source code between those
1507/// operations (since floating-point operations appearing earlier or later can be reordered).
1508///
1509/// If you need to perform some floating-point operations under different masking flags, rounding
1510/// mode, or denormals-are-zero mode, use an inline assembly block and make sure to restore the
1511/// original MXCSR register state before the end of the block.
1512///
1513/// ## Exception Flags
1514///
1515/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
1516///   Infinity by Infinity).
1517///
1518/// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized
1519///   number. Mainly this can cause loss of precision.
1520///
1521/// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred.
1522///
1523/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a
1524///   result was too large to be represented (e.g., an `f32` with absolute
1525///   value greater than `2^128`).
1526///
1527/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a
1528///   result was too small to be represented in a normalized way (e.g., an
1529///   `f32` with absolute value smaller than `2^-126`.)
1530///
1531/// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a.
1532///   precision exception). This means some precision was lost due to rounding.
1533///   For example, the fraction `1/3` cannot be represented accurately in a
1534///   32 or 64 bit float and computing it would cause this exception to be
1535///   raised. Precision exceptions are very common, so they are usually masked.
1536///
1537/// Exception flags can be read and set using the convenience functions
1538/// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
1539/// check if an operation caused some overflow:
1540///
1541/// ```rust,ignore
1542/// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
1543///                             // perform calculations
1544/// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
1545///     // handle overflow
1546/// }
1547/// ```
1548///
1549/// ## Masking Flags
1550///
1551/// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
1552/// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
1553/// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
1554///
1555/// A single masking bit can be set via
1556///
1557/// ```rust,ignore
1558/// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
1559/// ```
1560///
1561/// However, since mask bits are by default all set to 1, it is more common to
1562/// want to *disable* certain bits. For example, to unmask the underflow
1563/// exception, use:
1564///
1565/// ```rust,ignore
1566/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow exception
1568/// ```
1569///
1570/// Warning: an unmasked exception will cause an exception handler to be
1571/// called.
1572/// The standard handler will simply terminate the process. So, in this case
1573/// any underflow exception would terminate the current process with something
1574/// like `signal: 8, SIGFPE: erroneous arithmetic operation`.
1575///
1576/// ## Rounding Mode
1577///
1578/// The rounding mode is described using two bits. It can be read and set using
1579/// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
1580/// `_MM_SET_ROUNDING_MODE(mode)`.
1581///
1582/// The rounding modes are:
1583///
1584/// * `_MM_ROUND_NEAREST`: (default) Round to the value closest to the
1585///   infinite-precision result. If two values are equally close, round to
1586///   even (i.e., the least significant bit will be zero).
1587///
1588/// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
1589///
1590/// * `_MM_ROUND_UP`: Round toward positive Infinity.
1591///
1592/// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
1593///
1594/// Example:
1595///
1596/// ```rust,ignore
1597/// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
1598/// ```
1599///
1600/// ## Denormals-are-zero/Flush-to-zero Mode
1601///
1602/// If this bit is set, values that would be denormalized will be set to zero
1603/// instead. This is turned off by default.
1604///
1605/// You can read and enable/disable this mode via the helper functions
1606/// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
1607///
1608/// ```rust,ignore
1609/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default)
1610/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on
1611/// ```
1612///
1614/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr)
1615#[inline]
1616#[target_feature(enable = "sse")]
1617#[cfg_attr(test, assert_instr(ldmxcsr))]
1618#[stable(feature = "simd_x86", since = "1.27.0")]
1619#[deprecated(
1620    since = "1.75.0",
1621    note = "see `_mm_setcsr` documentation - use inline assembly instead"
1622)]
1623pub unsafe fn _mm_setcsr(val: u32) {
1624    ldmxcsr(ptr::addr_of!(val) as *const i8);
1625}
1626
1627/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1628#[stable(feature = "simd_x86", since = "1.27.0")]
1629pub const _MM_EXCEPT_INVALID: u32 = 0x0001;
1630/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1631#[stable(feature = "simd_x86", since = "1.27.0")]
1632pub const _MM_EXCEPT_DENORM: u32 = 0x0002;
1633/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1634#[stable(feature = "simd_x86", since = "1.27.0")]
1635pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004;
1636/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1637#[stable(feature = "simd_x86", since = "1.27.0")]
1638pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008;
1639/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1640#[stable(feature = "simd_x86", since = "1.27.0")]
1641pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010;
1642/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1643#[stable(feature = "simd_x86", since = "1.27.0")]
1644pub const _MM_EXCEPT_INEXACT: u32 = 0x0020;
1645/// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html)
1646#[stable(feature = "simd_x86", since = "1.27.0")]
1647pub const _MM_EXCEPT_MASK: u32 = 0x003f;
1648
1649/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1650#[stable(feature = "simd_x86", since = "1.27.0")]
1651pub const _MM_MASK_INVALID: u32 = 0x0080;
1652/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1653#[stable(feature = "simd_x86", since = "1.27.0")]
1654pub const _MM_MASK_DENORM: u32 = 0x0100;
1655/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1656#[stable(feature = "simd_x86", since = "1.27.0")]
1657pub const _MM_MASK_DIV_ZERO: u32 = 0x0200;
1658/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1659#[stable(feature = "simd_x86", since = "1.27.0")]
1660pub const _MM_MASK_OVERFLOW: u32 = 0x0400;
1661/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1662#[stable(feature = "simd_x86", since = "1.27.0")]
1663pub const _MM_MASK_UNDERFLOW: u32 = 0x0800;
1664/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1665#[stable(feature = "simd_x86", since = "1.27.0")]
1666pub const _MM_MASK_INEXACT: u32 = 0x1000;
1667/// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html)
1668#[stable(feature = "simd_x86", since = "1.27.0")]
1669pub const _MM_MASK_MASK: u32 = 0x1f80;
1670
1671/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1672#[stable(feature = "simd_x86", since = "1.27.0")]
1673pub const _MM_ROUND_NEAREST: u32 = 0x0000;
1674/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1675#[stable(feature = "simd_x86", since = "1.27.0")]
1676pub const _MM_ROUND_DOWN: u32 = 0x2000;
1677/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1678#[stable(feature = "simd_x86", since = "1.27.0")]
1679pub const _MM_ROUND_UP: u32 = 0x4000;
1680/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1681#[stable(feature = "simd_x86", since = "1.27.0")]
1682pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000;
1683
1684/// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html)
1685#[stable(feature = "simd_x86", since = "1.27.0")]
1686pub const _MM_ROUND_MASK: u32 = 0x6000;
1687
1688/// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html)
1689#[stable(feature = "simd_x86", since = "1.27.0")]
1690pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000;
1691/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1692#[stable(feature = "simd_x86", since = "1.27.0")]
1693pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000;
1694/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1695#[stable(feature = "simd_x86", since = "1.27.0")]
1696pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;
1697
1698/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1699///
1700/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK)
1701#[inline]
1702#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1703#[allow(non_snake_case)]
1704#[target_feature(enable = "sse")]
1705#[stable(feature = "simd_x86", since = "1.27.0")]
1706#[deprecated(
1707    since = "1.75.0",
1708    note = "see `_mm_getcsr` documentation - use inline assembly instead"
1709)]
1710pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
1711    _mm_getcsr() & _MM_MASK_MASK
1712}
1713
1714/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1715///
1716/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE)
1717#[inline]
1718#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1719#[allow(non_snake_case)]
1720#[target_feature(enable = "sse")]
1721#[stable(feature = "simd_x86", since = "1.27.0")]
1722#[deprecated(
1723    since = "1.75.0",
1724    note = "see `_mm_getcsr` documentation - use inline assembly instead"
1725)]
1726pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
1727    _mm_getcsr() & _MM_EXCEPT_MASK
1728}
1729
1730/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1731///
1732/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE)
1733#[inline]
1734#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1735#[allow(non_snake_case)]
1736#[target_feature(enable = "sse")]
1737#[stable(feature = "simd_x86", since = "1.27.0")]
1738#[deprecated(
1739    since = "1.75.0",
1740    note = "see `_mm_getcsr` documentation - use inline assembly instead"
1741)]
1742pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
1743    _mm_getcsr() & _MM_FLUSH_ZERO_MASK
1744}
1745
1746/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1747///
1748/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE)
1749#[inline]
1750#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1751#[allow(non_snake_case)]
1752#[target_feature(enable = "sse")]
1753#[stable(feature = "simd_x86", since = "1.27.0")]
1754#[deprecated(
1755    since = "1.75.0",
1756    note = "see `_mm_getcsr` documentation - use inline assembly instead"
1757)]
1758pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
1759    _mm_getcsr() & _MM_ROUND_MASK
1760}
1761
1762/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1763///
1764/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK)
1765#[inline]
1766#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1767#[allow(non_snake_case)]
1768#[target_feature(enable = "sse")]
1769#[stable(feature = "simd_x86", since = "1.27.0")]
1770#[deprecated(
1771    since = "1.75.0",
1772    note = "see `_mm_setcsr` documentation - use inline assembly instead"
1773)]
1774pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
1775    _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | (x & _MM_MASK_MASK))
1776}
1777
1778/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1779///
1780/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE)
1781#[inline]
1782#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1783#[allow(non_snake_case)]
1784#[target_feature(enable = "sse")]
1785#[stable(feature = "simd_x86", since = "1.27.0")]
1786#[deprecated(
1787    since = "1.75.0",
1788    note = "see `_mm_setcsr` documentation - use inline assembly instead"
1789)]
1790pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
1791    _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | (x & _MM_EXCEPT_MASK))
1792}
1793
1794/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1795///
1796/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE)
1797#[inline]
1798#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1799#[allow(non_snake_case)]
1800#[target_feature(enable = "sse")]
1801#[stable(feature = "simd_x86", since = "1.27.0")]
1802#[deprecated(
1803    since = "1.75.0",
1804    note = "see `_mm_setcsr` documentation - use inline assembly instead"
1805)]
1806pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
1807    _mm_setcsr((_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | (x & _MM_FLUSH_ZERO_MASK))
1808}
1809
1810/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1811///
1812/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE)
1813#[inline]
1814#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1815#[allow(non_snake_case)]
1816#[target_feature(enable = "sse")]
1817#[stable(feature = "simd_x86", since = "1.27.0")]
1818#[deprecated(
1819    since = "1.75.0",
1820    note = "see `_mm_setcsr` documentation - use inline assembly instead"
1821)]
1822pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
1823    _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | (x & _MM_ROUND_MASK))
1824}
1825
1826/// See [`_mm_prefetch`](fn._mm_prefetch.html).
1827#[stable(feature = "simd_x86", since = "1.27.0")]
1828pub const _MM_HINT_T0: i32 = 3;
1829
1830/// See [`_mm_prefetch`](fn._mm_prefetch.html).
1831#[stable(feature = "simd_x86", since = "1.27.0")]
1832pub const _MM_HINT_T1: i32 = 2;
1833
1834/// See [`_mm_prefetch`](fn._mm_prefetch.html).
1835#[stable(feature = "simd_x86", since = "1.27.0")]
1836pub const _MM_HINT_T2: i32 = 1;
1837
1838/// See [`_mm_prefetch`](fn._mm_prefetch.html).
1839#[stable(feature = "simd_x86", since = "1.27.0")]
1840pub const _MM_HINT_NTA: i32 = 0;
1841
1842/// See [`_mm_prefetch`](fn._mm_prefetch.html).
1843#[stable(feature = "simd_x86", since = "1.27.0")]
1844pub const _MM_HINT_ET0: i32 = 7;
1845
1846/// See [`_mm_prefetch`](fn._mm_prefetch.html).
1847#[stable(feature = "simd_x86", since = "1.27.0")]
1848pub const _MM_HINT_ET1: i32 = 6;
1849
1850/// Fetch the cache line that contains address `p` using the given `STRATEGY`.
1851///
1852/// The `STRATEGY` must be one of:
1853///
1854/// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the
1855///   cache hierarchy.
1856///
1857/// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher.
1858///
1859/// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or
1860///   an implementation-specific choice (e.g., L2 if there is no L3).
1861///
1862/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the
1863///   non-temporal access (NTA) hint. The data may be placed somewhere closer
1864///   than main memory but outside of the cache hierarchy. This is used to
1865///   reduce access latency without polluting the cache.
1866///
1867/// * [`_MM_HINT_ET0`](constant._MM_HINT_ET0.html) and
1868///   [`_MM_HINT_ET1`](constant._MM_HINT_ET1.html) are similar to `_MM_HINT_T0`
1869///   and `_MM_HINT_T1` but indicate an anticipation to write to the address.
1870///
1871/// The actual implementation depends on the particular CPU. This instruction
1872/// is considered a hint, so the CPU is also free to simply ignore the request.
1873///
1874/// The amount of prefetched data depends on the cache line size of the
1875/// specific CPU, but it will be at least 32 bytes.
1876///
1877/// Common caveats:
1878///
1879/// * Most modern CPUs already automatically prefetch data based on predicted
1880///   access patterns.
1881///
1882/// * Data is usually not fetched if this would cause a TLB miss or a page
1883///   fault.
1884///
1885/// * Too much prefetching can cause unnecessary cache evictions.
1886///
1887/// * Prefetching may also fail if there are not enough memory-subsystem
1888///   resources (e.g., request buffers).
1889///
1890/// Note: this intrinsic is safe to use even though it takes a raw pointer argument, since
1891/// prefetching cannot change the behavior of the program; in particular, it does not trap on invalid pointers.
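///
/// A minimal usage sketch (the strategy is passed as the constant argument):
///
/// ```rust,ignore
/// let data = [0u8; 4096];
/// _mm_prefetch(data.as_ptr().cast::<i8>(), _MM_HINT_T0);
/// ```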
1892///
1893/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch)
1894#[inline]
1895#[target_feature(enable = "sse")]
1896#[cfg_attr(test, assert_instr(prefetcht0, STRATEGY = _MM_HINT_T0))]
1897#[cfg_attr(test, assert_instr(prefetcht1, STRATEGY = _MM_HINT_T1))]
1898#[cfg_attr(test, assert_instr(prefetcht2, STRATEGY = _MM_HINT_T2))]
1899#[cfg_attr(test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))]
1900#[rustc_legacy_const_generics(1)]
1901#[stable(feature = "simd_x86", since = "1.27.0")]
1902pub fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
1903    static_assert_uimm_bits!(STRATEGY, 3);
1904    // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache).
1905    // `locality` and `rw` are based on our `STRATEGY`.
1906    unsafe {
1907        prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, 1);
1908    }
1909}
1910
1911/// Returns a vector of type `__m128` with indeterminate elements.
1912/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
1913/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
1914/// In practice, this is typically equivalent to [`mem::zeroed`].
1915///
1916/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps)
1917#[inline]
1918#[target_feature(enable = "sse")]
1919#[stable(feature = "simd_x86", since = "1.27.0")]
1920pub fn _mm_undefined_ps() -> __m128 {
1921    const { unsafe { mem::zeroed() } }
1922}
1923
1924/// Transpose the 4x4 matrix formed by 4 rows of __m128 in place.
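///
/// A minimal usage sketch (the four rows hold a row-major 4x4 matrix):
///
/// ```rust,ignore
/// let mut r0 = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let mut r1 = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
/// let mut r2 = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
/// let mut r3 = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);
/// _MM_TRANSPOSE4_PS(&mut r0, &mut r1, &mut r2, &mut r3);
/// // r0 now holds [1.0, 5.0, 9.0, 13.0], the first column of the input
/// ```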
1925///
1926/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_TRANSPOSE4_PS)
1927#[inline]
1928#[allow(non_snake_case)]
1929#[target_feature(enable = "sse")]
1930#[stable(feature = "simd_x86", since = "1.27.0")]
1931pub fn _MM_TRANSPOSE4_PS(
1932    row0: &mut __m128,
1933    row1: &mut __m128,
1934    row2: &mut __m128,
1935    row3: &mut __m128,
1936) {
1937    let tmp0 = _mm_unpacklo_ps(*row0, *row1);
1938    let tmp2 = _mm_unpacklo_ps(*row2, *row3);
1939    let tmp1 = _mm_unpackhi_ps(*row0, *row1);
1940    let tmp3 = _mm_unpackhi_ps(*row2, *row3);
1941
1942    *row0 = _mm_movelh_ps(tmp0, tmp2);
1943    *row1 = _mm_movehl_ps(tmp2, tmp0);
1944    *row2 = _mm_movelh_ps(tmp1, tmp3);
1945    *row3 = _mm_movehl_ps(tmp3, tmp1);
1946}
1947
1948#[allow(improper_ctypes)]
1949unsafe extern "C" {
1950    #[link_name = "llvm.x86.sse.rcp.ss"]
1951    fn rcpss(a: __m128) -> __m128;
1952    #[link_name = "llvm.x86.sse.rcp.ps"]
1953    fn rcpps(a: __m128) -> __m128;
1954    #[link_name = "llvm.x86.sse.rsqrt.ss"]
1955    fn rsqrtss(a: __m128) -> __m128;
1956    #[link_name = "llvm.x86.sse.rsqrt.ps"]
1957    fn rsqrtps(a: __m128) -> __m128;
1958    #[link_name = "llvm.x86.sse.min.ss"]
1959    fn minss(a: __m128, b: __m128) -> __m128;
1960    #[link_name = "llvm.x86.sse.min.ps"]
1961    fn minps(a: __m128, b: __m128) -> __m128;
1962    #[link_name = "llvm.x86.sse.max.ss"]
1963    fn maxss(a: __m128, b: __m128) -> __m128;
1964    #[link_name = "llvm.x86.sse.max.ps"]
1965    fn maxps(a: __m128, b: __m128) -> __m128;
1966    #[link_name = "llvm.x86.sse.cmp.ps"]
1967    fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
1968    #[link_name = "llvm.x86.sse.comieq.ss"]
1969    fn comieq_ss(a: __m128, b: __m128) -> i32;
1970    #[link_name = "llvm.x86.sse.comilt.ss"]
1971    fn comilt_ss(a: __m128, b: __m128) -> i32;
1972    #[link_name = "llvm.x86.sse.comile.ss"]
1973    fn comile_ss(a: __m128, b: __m128) -> i32;
1974    #[link_name = "llvm.x86.sse.comigt.ss"]
1975    fn comigt_ss(a: __m128, b: __m128) -> i32;
1976    #[link_name = "llvm.x86.sse.comige.ss"]
1977    fn comige_ss(a: __m128, b: __m128) -> i32;
1978    #[link_name = "llvm.x86.sse.comineq.ss"]
1979    fn comineq_ss(a: __m128, b: __m128) -> i32;
1980    #[link_name = "llvm.x86.sse.ucomieq.ss"]
1981    fn ucomieq_ss(a: __m128, b: __m128) -> i32;
1982    #[link_name = "llvm.x86.sse.ucomilt.ss"]
1983    fn ucomilt_ss(a: __m128, b: __m128) -> i32;
1984    #[link_name = "llvm.x86.sse.ucomile.ss"]
1985    fn ucomile_ss(a: __m128, b: __m128) -> i32;
1986    #[link_name = "llvm.x86.sse.ucomigt.ss"]
1987    fn ucomigt_ss(a: __m128, b: __m128) -> i32;
1988    #[link_name = "llvm.x86.sse.ucomige.ss"]
1989    fn ucomige_ss(a: __m128, b: __m128) -> i32;
1990    #[link_name = "llvm.x86.sse.ucomineq.ss"]
1991    fn ucomineq_ss(a: __m128, b: __m128) -> i32;
1992    #[link_name = "llvm.x86.sse.cvtss2si"]
1993    fn cvtss2si(a: __m128) -> i32;
1994    #[link_name = "llvm.x86.sse.cvttss2si"]
1995    fn cvttss2si(a: __m128) -> i32;
1996    #[link_name = "llvm.x86.sse.sfence"]
1997    fn sfence();
1998    #[link_name = "llvm.x86.sse.stmxcsr"]
1999    fn stmxcsr(p: *mut i8);
2000    #[link_name = "llvm.x86.sse.ldmxcsr"]
2001    fn ldmxcsr(p: *const i8);
2002    #[link_name = "llvm.prefetch"]
2003    fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32);
2004    #[link_name = "llvm.x86.sse.cmp.ss"]
2005    fn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
2006}
2007
2008/// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint.
2009///
2010/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
2011/// exception _may_ be generated.
2012///
2013/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps)
2014///
2015/// # Safety of non-temporal stores
2016///
2017/// After using this intrinsic, but before any other access to the memory that this intrinsic
2018/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
2019/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
2020/// return.
2021///
2022/// See [`_mm_sfence`] for details.
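///
/// A minimal sketch of a correct call sequence (assuming `p` is a valid,
/// 16-byte-aligned `*mut f32`):
///
/// ```rust,ignore
/// _mm_stream_ps(p, _mm_set1_ps(0.0));
/// _mm_sfence(); // required before any other access to the stored-to memory
/// ```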
2023#[inline]
2024#[target_feature(enable = "sse")]
2025#[cfg_attr(test, assert_instr(movntps))]
2026#[stable(feature = "simd_x86", since = "1.27.0")]
2027#[allow(clippy::cast_ptr_alignment)]
2028pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
2029    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
2030    crate::arch::asm!(
2031        vps!("movntps", ",{a}"),
2032        p = in(reg) mem_addr,
2033        a = in(xmm_reg) a,
2034        options(nostack, preserves_flags),
2035    );
2036}
2037
2038#[cfg(test)]
2039mod tests {
2040    use crate::{hint::black_box, mem::transmute, ptr};
2041    use std::boxed;
2042    use stdarch_test::simd_test;
2043
2044    use crate::core_arch::{simd::*, x86::*};
2045
2046    const NAN: f32 = f32::NAN;
2047
2048    #[simd_test(enable = "sse")]
2049    unsafe fn test_mm_add_ps() {
2050        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2051        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2052        let r = _mm_add_ps(a, b);
2053        assert_eq_m128(r, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0));
2054    }
2055
2056    #[simd_test(enable = "sse")]
2057    unsafe fn test_mm_add_ss() {
2058        let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0);
2059        let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0);
2060        let r = _mm_add_ss(a, b);
2061        assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0));
2062    }
2063
2064    #[simd_test(enable = "sse")]
2065    unsafe fn test_mm_sub_ps() {
2066        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2067        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2068        let r = _mm_sub_ps(a, b);
2069        assert_eq_m128(r, _mm_setr_ps(99.0, -15.0, 0.0, -5.0));
2070    }
2071
2072    #[simd_test(enable = "sse")]
2073    unsafe fn test_mm_sub_ss() {
2074        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2075        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2076        let r = _mm_sub_ss(a, b);
2077        assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0));
2078    }
2079
2080    #[simd_test(enable = "sse")]
2081    unsafe fn test_mm_mul_ps() {
2082        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2083        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2084        let r = _mm_mul_ps(a, b);
2085        assert_eq_m128(r, _mm_setr_ps(100.0, 100.0, 0.0, 50.0));
2086    }
2087
2088    #[simd_test(enable = "sse")]
2089    unsafe fn test_mm_mul_ss() {
2090        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2091        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2092        let r = _mm_mul_ss(a, b);
2093        assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0));
2094    }
2095
2096    #[simd_test(enable = "sse")]
2097    unsafe fn test_mm_div_ps() {
2098        let a = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0);
2099        let b = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0);
2100        let r = _mm_div_ps(a, b);
2101        assert_eq_m128(r, _mm_setr_ps(0.01, 0.25, 10.0, 2.0));
2102    }
2103
2104    #[simd_test(enable = "sse")]
2105    unsafe fn test_mm_div_ss() {
2106        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2107        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2108        let r = _mm_div_ss(a, b);
2109        assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0));
2110    }
2111
2112    #[simd_test(enable = "sse")]
2113    unsafe fn test_mm_sqrt_ss() {
2114        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2115        let r = _mm_sqrt_ss(a);
2116        let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0);
2117        assert_eq_m128(r, e);
2118    }
2119
2120    #[simd_test(enable = "sse")]
2121    unsafe fn test_mm_sqrt_ps() {
2122        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2123        let r = _mm_sqrt_ps(a);
2124        let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0);
2125        assert_eq_m128(r, e);
2126    }
2127
2128    #[simd_test(enable = "sse")]
2129    unsafe fn test_mm_rcp_ss() {
2130        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2131        let r = _mm_rcp_ss(a);
2132        let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0);
2133        let rel_err = 0.00048828125;
2134        assert_approx_eq!(get_m128(r, 0), get_m128(e, 0), 2. * rel_err);
2135        for i in 1..4 {
2136            assert_eq!(get_m128(r, i), get_m128(e, i));
2137        }
2138    }
2139
2140    #[simd_test(enable = "sse")]
2141    unsafe fn test_mm_rcp_ps() {
2142        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2143        let r = _mm_rcp_ps(a);
2144        let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
2145        let rel_err = 0.00048828125;
2146        for i in 0..4 {
2147            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
2148        }
2149    }
2150
2151    #[simd_test(enable = "sse")]
2152    unsafe fn test_mm_rsqrt_ss() {
2153        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2154        let r = _mm_rsqrt_ss(a);
2155        let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0);
2156        let rel_err = 0.00048828125;
2157        for i in 0..4 {
2158            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
2159        }
2160    }
2161
2162    #[simd_test(enable = "sse")]
2163    unsafe fn test_mm_rsqrt_ps() {
2164        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2165        let r = _mm_rsqrt_ps(a);
2166        let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845);
2167        let rel_err = 0.00048828125;
2168        for i in 0..4 {
2169            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
2170        }
2171    }
2172
2173    #[simd_test(enable = "sse")]
2174    unsafe fn test_mm_min_ss() {
2175        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2176        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2177        let r = _mm_min_ss(a, b);
2178        assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
2179    }
2180
2181    #[simd_test(enable = "sse")]
2182    unsafe fn test_mm_min_ps() {
2183        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2184        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2185        let r = _mm_min_ps(a, b);
2186        assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
2187
2188        // `_mm_min_ps` can **not** be implemented using the `simd_min` Rust intrinsic. `simd_min`
2189        // is lowered by the LLVM codegen backend to the `llvm.minnum.v*` LLVM intrinsic. This
2190        // intrinsic doesn't specify how -0.0 is handled. Unfortunately, it happens to behave
2191        // differently from the `minps` x86 instruction: `llvm.minnum.v*` would make `r1` equal
2192        // to `a` and `r2` equal to `b`.
2193        let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
2194        let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
2195        let r1: [u8; 16] = transmute(_mm_min_ps(a, b));
2196        let r2: [u8; 16] = transmute(_mm_min_ps(b, a));
2197        let a: [u8; 16] = transmute(a);
2198        let b: [u8; 16] = transmute(b);
2199        assert_eq!(r1, b);
2200        assert_eq!(r2, a);
2201        assert_ne!(a, b); // sanity check that -0.0 is actually present
2202    }
2203
2204    #[simd_test(enable = "sse")]
2205    unsafe fn test_mm_max_ss() {
2206        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2207        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2208        let r = _mm_max_ss(a, b);
2209        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0));
2210    }
2211
2212    #[simd_test(enable = "sse")]
2213    unsafe fn test_mm_max_ps() {
2214        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2215        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2216        let r = _mm_max_ps(a, b);
2217        assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0));
2218
2219        // Check SSE-specific semantics for -0.0 handling.
2220        let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
2221        let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
2222        let r1: [u8; 16] = transmute(_mm_max_ps(a, b));
2223        let r2: [u8; 16] = transmute(_mm_max_ps(b, a));
2224        let a: [u8; 16] = transmute(a);
2225        let b: [u8; 16] = transmute(b);
2226        assert_eq!(r1, b);
2227        assert_eq!(r2, a);
2228        assert_ne!(a, b); // sanity check that -0.0 is actually present
2229    }
2230
2231    #[simd_test(enable = "sse")]
2232    unsafe fn test_mm_and_ps() {
2233        let a = transmute(u32x4::splat(0b0011));
2234        let b = transmute(u32x4::splat(0b0101));
2235        let r = _mm_and_ps(*black_box(&a), *black_box(&b));
2236        let e = transmute(u32x4::splat(0b0001));
2237        assert_eq_m128(r, e);
2238    }
2239
2240    #[simd_test(enable = "sse")]
2241    unsafe fn test_mm_andnot_ps() {
2242        let a = transmute(u32x4::splat(0b0011));
2243        let b = transmute(u32x4::splat(0b0101));
2244        let r = _mm_andnot_ps(*black_box(&a), *black_box(&b));
2245        let e = transmute(u32x4::splat(0b0100));
2246        assert_eq_m128(r, e);
2247    }
2248
2249    #[simd_test(enable = "sse")]
2250    unsafe fn test_mm_or_ps() {
2251        let a = transmute(u32x4::splat(0b0011));
2252        let b = transmute(u32x4::splat(0b0101));
2253        let r = _mm_or_ps(*black_box(&a), *black_box(&b));
2254        let e = transmute(u32x4::splat(0b0111));
2255        assert_eq_m128(r, e);
2256    }
2257
2258    #[simd_test(enable = "sse")]
2259    unsafe fn test_mm_xor_ps() {
2260        let a = transmute(u32x4::splat(0b0011));
2261        let b = transmute(u32x4::splat(0b0101));
2262        let r = _mm_xor_ps(*black_box(&a), *black_box(&b));
2263        let e = transmute(u32x4::splat(0b0110));
2264        assert_eq_m128(r, e);
2265    }
2266
2267    #[simd_test(enable = "sse")]
2268    unsafe fn test_mm_cmpeq_ss() {
2269        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2270        let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0);
2271        let r: u32x4 = transmute(_mm_cmpeq_ss(a, b));
2272        let e: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0), 2.0, 3.0, 4.0));
2273        assert_eq!(r, e);
2274
2275        let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2276        let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2));
2277        let e2: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0xffffffff), 2.0, 3.0, 4.0));
2278        assert_eq!(r2, e2);
2279    }
2280
2281    #[simd_test(enable = "sse")]
2282    unsafe fn test_mm_cmplt_ss() {
2283        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2284        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2285        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2286        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2287
2288        let b1 = 0u32; // a.extract(0) < b.extract(0)
2289        let c1 = 0u32; // a.extract(0) < c.extract(0)
2290        let d1 = !0u32; // a.extract(0) < d.extract(0)
2291
2292        let rb: u32x4 = transmute(_mm_cmplt_ss(a, b));
2293        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2294        assert_eq!(rb, eb);
2295
2296        let rc: u32x4 = transmute(_mm_cmplt_ss(a, c));
2297        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2298        assert_eq!(rc, ec);
2299
2300        let rd: u32x4 = transmute(_mm_cmplt_ss(a, d));
2301        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2302        assert_eq!(rd, ed);
2303    }
2304
2305    #[simd_test(enable = "sse")]
2306    unsafe fn test_mm_cmple_ss() {
2307        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2308        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2309        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2310        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2311
2312        let b1 = 0u32; // a.extract(0) <= b.extract(0)
2313        let c1 = !0u32; // a.extract(0) <= c.extract(0)
2314        let d1 = !0u32; // a.extract(0) <= d.extract(0)
2315
2316        let rb: u32x4 = transmute(_mm_cmple_ss(a, b));
2317        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2318        assert_eq!(rb, eb);
2319
2320        let rc: u32x4 = transmute(_mm_cmple_ss(a, c));
2321        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2322        assert_eq!(rc, ec);
2323
2324        let rd: u32x4 = transmute(_mm_cmple_ss(a, d));
2325        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2326        assert_eq!(rd, ed);
2327    }
2328
2329    #[simd_test(enable = "sse")]
2330    unsafe fn test_mm_cmpgt_ss() {
2331        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2332        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2333        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2334        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2335
2336        let b1 = !0u32; // a.extract(0) > b.extract(0)
2337        let c1 = 0u32; // a.extract(0) > c.extract(0)
2338        let d1 = 0u32; // a.extract(0) > d.extract(0)
2339
2340        let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b));
2341        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2342        assert_eq!(rb, eb);
2343
2344        let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c));
2345        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2346        assert_eq!(rc, ec);
2347
2348        let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d));
2349        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2350        assert_eq!(rd, ed);
2351    }
2352
2353    #[simd_test(enable = "sse")]
2354    unsafe fn test_mm_cmpge_ss() {
2355        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2356        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2357        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2358        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2359
2360        let b1 = !0u32; // a.extract(0) >= b.extract(0)
2361        let c1 = !0u32; // a.extract(0) >= c.extract(0)
2362        let d1 = 0u32; // a.extract(0) >= d.extract(0)
2363
2364        let rb: u32x4 = transmute(_mm_cmpge_ss(a, b));
2365        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2366        assert_eq!(rb, eb);
2367
2368        let rc: u32x4 = transmute(_mm_cmpge_ss(a, c));
2369        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2370        assert_eq!(rc, ec);
2371
2372        let rd: u32x4 = transmute(_mm_cmpge_ss(a, d));
2373        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2374        assert_eq!(rd, ed);
2375    }
2376
2377    #[simd_test(enable = "sse")]
2378    unsafe fn test_mm_cmpneq_ss() {
2379        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2380        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2381        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2382        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2383
2384        let b1 = !0u32; // a.extract(0) != b.extract(0)
2385        let c1 = 0u32; // a.extract(0) != c.extract(0)
2386        let d1 = !0u32; // a.extract(0) != d.extract(0)
2387
2388        let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b));
2389        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2390        assert_eq!(rb, eb);
2391
2392        let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c));
2393        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2394        assert_eq!(rc, ec);
2395
2396        let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d));
2397        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2398        assert_eq!(rd, ed);
2399    }
2400
2401    #[simd_test(enable = "sse")]
2402    unsafe fn test_mm_cmpnlt_ss() {
2403        // TODO: this test is exactly the same as for `_mm_cmpge_ss`, but there
2404        // must be a difference. It may have to do with behavior in the
2405        // presence of NaNs (signaling or quiet). If so, we should add tests
2406        // for those.
2407
2408        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2409        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2410        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2411        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2412
2413        let b1 = !0u32; // a.extract(0) >= b.extract(0)
2414        let c1 = !0u32; // a.extract(0) >= c.extract(0)
2415        let d1 = 0u32; // a.extract(0) >= d.extract(0)
2416
2417        let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b));
2418        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2419        assert_eq!(rb, eb);
2420
2421        let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c));
2422        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2423        assert_eq!(rc, ec);
2424
2425        let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d));
2426        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2427        assert_eq!(rd, ed);
2428    }
2429
2430    #[simd_test(enable = "sse")]
2431    unsafe fn test_mm_cmpnle_ss() {
2432        // TODO: this test is exactly the same as for `_mm_cmpgt_ss`, but there
2433        // must be a difference. It may have to do with behavior in the
2434        // presence of NaNs (signaling or quiet). If so, we should add tests
2435        // for those.
2436
2437        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2438        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2439        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2440        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2441
2442        let b1 = !0u32; // a.extract(0) > b.extract(0)
2443        let c1 = 0u32; // a.extract(0) > c.extract(0)
2444        let d1 = 0u32; // a.extract(0) > d.extract(0)
2445
2446        let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b));
2447        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2448        assert_eq!(rb, eb);
2449
2450        let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c));
2451        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2452        assert_eq!(rc, ec);
2453
2454        let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d));
2455        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2456        assert_eq!(rd, ed);
2457    }
2458
2459    #[simd_test(enable = "sse")]
2460    unsafe fn test_mm_cmpngt_ss() {
2461        // TODO: this test is exactly the same as for `_mm_cmple_ss`, but there
2462        // must be a difference. It may have to do with behavior in the
2463        // presence of NaNs (signaling or quiet). If so, we should add tests
2464        // for those.
2465
2466        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2467        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2468        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2469        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2470
2471        let b1 = 0u32; // a.extract(0) <= b.extract(0)
2472        let c1 = !0u32; // a.extract(0) <= c.extract(0)
2473        let d1 = !0u32; // a.extract(0) <= d.extract(0)
2474
2475        let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b));
2476        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2477        assert_eq!(rb, eb);
2478
2479        let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c));
2480        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2481        assert_eq!(rc, ec);
2482
2483        let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d));
2484        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2485        assert_eq!(rd, ed);
2486    }
2487
2488    #[simd_test(enable = "sse")]
2489    unsafe fn test_mm_cmpnge_ss() {
2490        // TODO: this test is exactly the same as for `_mm_cmplt_ss`, but there
2491        // must be a difference. It may have to do with behavior in the
2492        // presence of NaNs (signaling or quiet). If so, we should add tests
2493        // for those.
2494
2495        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2496        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2497        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2498        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2499
2500        let b1 = 0u32; // a.extract(0) < b.extract(0)
2501        let c1 = 0u32; // a.extract(0) < c.extract(0)
2502        let d1 = !0u32; // a.extract(0) < d.extract(0)
2503
2504        let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b));
2505        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2506        assert_eq!(rb, eb);
2507
2508        let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c));
2509        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2510        assert_eq!(rc, ec);
2511
2512        let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d));
2513        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2514        assert_eq!(rd, ed);
2515    }
2516
2517    #[simd_test(enable = "sse")]
2518    unsafe fn test_mm_cmpord_ss() {
2519        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2520        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2521        let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
2522        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2523
2524        let b1 = !0u32; // a.extract(0) ord b.extract(0)
2525        let c1 = 0u32; // a.extract(0) ord c.extract(0)
2526        let d1 = !0u32; // a.extract(0) ord d.extract(0)
2527
2528        let rb: u32x4 = transmute(_mm_cmpord_ss(a, b));
2529        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2530        assert_eq!(rb, eb);
2531
2532        let rc: u32x4 = transmute(_mm_cmpord_ss(a, c));
2533        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2534        assert_eq!(rc, ec);
2535
2536        let rd: u32x4 = transmute(_mm_cmpord_ss(a, d));
2537        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2538        assert_eq!(rd, ed);
2539    }
2540
2541    #[simd_test(enable = "sse")]
2542    unsafe fn test_mm_cmpunord_ss() {
2543        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2544        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2545        let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
2546        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2547
2548        let b1 = 0u32; // a.extract(0) unord b.extract(0)
2549        let c1 = !0u32; // a.extract(0) unord c.extract(0)
2550        let d1 = 0u32; // a.extract(0) unord d.extract(0)
2551
2552        let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b));
2553        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2554        assert_eq!(rb, eb);
2555
2556        let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c));
2557        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2558        assert_eq!(rc, ec);
2559
2560        let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d));
2561        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2562        assert_eq!(rd, ed);
2563    }
2564
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpeq_ps() {
        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
        let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(fls, fls, tru, fls);
        let r: u32x4 = transmute(_mm_cmpeq_ps(a, b));
        assert_eq!(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmplt_ps() {
        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
        let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(tru, fls, fls, fls);
        let r: u32x4 = transmute(_mm_cmplt_ps(a, b));
        assert_eq!(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmple_ps() {
        let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0);
        let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(tru, fls, tru, fls);
        let r: u32x4 = transmute(_mm_cmple_ps(a, b));
        assert_eq!(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpgt_ps() {
        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
        let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(fls, tru, fls, fls);
        let r: u32x4 = transmute(_mm_cmpgt_ps(a, b));
        assert_eq!(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpge_ps() {
        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
        let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(fls, tru, tru, fls);
        let r: u32x4 = transmute(_mm_cmpge_ps(a, b));
        assert_eq!(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpneq_ps() {
        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
        let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(tru, tru, fls, tru);
        let r: u32x4 = transmute(_mm_cmpneq_ps(a, b));
        assert_eq!(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpnlt_ps() {
        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
        let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(fls, tru, tru, tru);
        let r: u32x4 = transmute(_mm_cmpnlt_ps(a, b));
        assert_eq!(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpnle_ps() {
        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
        let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(fls, tru, fls, tru);
        let r: u32x4 = transmute(_mm_cmpnle_ps(a, b));
        assert_eq!(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpngt_ps() {
        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
        let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(tru, fls, tru, tru);
        let r: u32x4 = transmute(_mm_cmpngt_ps(a, b));
        assert_eq!(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpnge_ps() {
        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
        let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(tru, fls, fls, tru);
        let r: u32x4 = transmute(_mm_cmpnge_ps(a, b));
        assert_eq!(r, e);
    }

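    // "Ordered" means neither operand is NAN; "unordered" means at least one is.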
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpord_ps() {
        let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
        let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(tru, fls, fls, fls);
        let r: u32x4 = transmute(_mm_cmpord_ps(a, b));
        assert_eq!(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpunord_ps() {
        let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
        let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(fls, tru, tru, tru);
        let r: u32x4 = transmute(_mm_cmpunord_ps(a, b));
        assert_eq!(r, e);
    }

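    // The comi* intrinsics compare only lane 0 and return the result as an
    // i32 (0 or 1). With a NAN operand every predicate yields 0 except
    // comineq, which yields 1.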
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_comieq_ss() {
        let aa = &[3.0f32, 12.0, 23.0, NAN];
        let bb = &[3.0f32, 47.5, 1.5, NAN];

        let ee = &[1i32, 0, 0, 0];

        for i in 0..4 {
            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);

            let r = _mm_comieq_ss(a, b);

            assert_eq!(
                ee[i], r,
                "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
                a, b, r, ee[i], i
            );
        }
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_comilt_ss() {
        let aa = &[3.0f32, 12.0, 23.0, NAN];
        let bb = &[3.0f32, 47.5, 1.5, NAN];

        let ee = &[0i32, 1, 0, 0];

        for i in 0..4 {
            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);

            let r = _mm_comilt_ss(a, b);

            assert_eq!(
                ee[i], r,
                "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
                a, b, r, ee[i], i
            );
        }
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_comile_ss() {
        let aa = &[3.0f32, 12.0, 23.0, NAN];
        let bb = &[3.0f32, 47.5, 1.5, NAN];

        let ee = &[1i32, 1, 0, 0];

        for i in 0..4 {
            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);

            let r = _mm_comile_ss(a, b);

            assert_eq!(
                ee[i], r,
                "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})",
                a, b, r, ee[i], i
            );
        }
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_comige_ss() {
        let aa = &[3.0f32, 12.0, 23.0, NAN];
        let bb = &[3.0f32, 47.5, 1.5, NAN];

        let ee = &[1i32, 0, 1, 0];

        for i in 0..4 {
            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);

            let r = _mm_comige_ss(a, b);

            assert_eq!(
                ee[i], r,
                "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})",
                a, b, r, ee[i], i
            );
        }
    }

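    // The original listing had no strict greater-than counterpart at this
    // point; the following is a minimal companion sketch for _mm_comigt_ss
    // using the same inputs. Being strict, the equal pair (3.0, 3.0) now
    // yields 0, and NAN still yields 0.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_comigt_ss() {
        let aa = &[3.0f32, 12.0, 23.0, NAN];
        let bb = &[3.0f32, 47.5, 1.5, NAN];

        let ee = &[0i32, 0, 1, 0];

        for i in 0..4 {
            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);

            let r = _mm_comigt_ss(a, b);

            assert_eq!(
                ee[i], r,
                "_mm_comigt_ss({:?}, {:?}) = {}, expected: {} (i={})",
                a, b, r, ee[i], i
            );
        }
    }
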
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_comineq_ss() {
        let aa = &[3.0f32, 12.0, 23.0, NAN];
        let bb = &[3.0f32, 47.5, 1.5, NAN];

        let ee = &[0i32, 1, 1, 1];

        for i in 0..4 {
            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);

            let r = _mm_comineq_ss(a, b);

            assert_eq!(
                ee[i], r,
                "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
                a, b, r, ee[i], i
            );
        }
    }

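    // The ucomi* variants return the same results as comi*; they differ only
    // in exception behavior (COMISS signals invalid for any NAN operand,
    // UCOMISS only for signaling NANs).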
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_ucomieq_ss() {
        let aa = &[3.0f32, 12.0, 23.0, NAN];
        let bb = &[3.0f32, 47.5, 1.5, NAN];

        let ee = &[1i32, 0, 0, 0];

        for i in 0..4 {
            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);

            let r = _mm_ucomieq_ss(a, b);

            assert_eq!(
                ee[i], r,
                "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
                a, b, r, ee[i], i
            );
        }
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_ucomilt_ss() {
        let aa = &[3.0f32, 12.0, 23.0, NAN];
        let bb = &[3.0f32, 47.5, 1.5, NAN];

        let ee = &[0i32, 1, 0, 0];

        for i in 0..4 {
            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);

            let r = _mm_ucomilt_ss(a, b);

            assert_eq!(
                ee[i], r,
                "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
                a, b, r, ee[i], i
            );
        }
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_ucomile_ss() {
        let aa = &[3.0f32, 12.0, 23.0, NAN];
        let bb = &[3.0f32, 47.5, 1.5, NAN];

        let ee = &[1i32, 1, 0, 0];

        for i in 0..4 {
            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);

            let r = _mm_ucomile_ss(a, b);

            assert_eq!(
                ee[i], r,
                "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})",
                a, b, r, ee[i], i
            );
        }
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_ucomigt_ss() {
        let aa = &[3.0f32, 12.0, 23.0, NAN];
        let bb = &[3.0f32, 47.5, 1.5, NAN];

        let ee = &[0i32, 0, 1, 0];

        for i in 0..4 {
            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);

            let r = _mm_ucomigt_ss(a, b);

            assert_eq!(
                ee[i], r,
                "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})",
                a, b, r, ee[i], i
            );
        }
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_ucomige_ss() {
        let aa = &[3.0f32, 12.0, 23.0, NAN];
        let bb = &[3.0f32, 47.5, 1.5, NAN];

        let ee = &[1i32, 0, 1, 0];

        for i in 0..4 {
            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);

            let r = _mm_ucomige_ss(a, b);

            assert_eq!(
                ee[i], r,
                "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})",
                a, b, r, ee[i], i
            );
        }
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_ucomineq_ss() {
        let aa = &[3.0f32, 12.0, 23.0, NAN];
        let bb = &[3.0f32, 47.5, 1.5, NAN];

        let ee = &[0i32, 1, 1, 1];

        for i in 0..4 {
            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);

            let r = _mm_ucomineq_ss(a, b);

            assert_eq!(
                ee[i], r,
                "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
                a, b, r, ee[i], i
            );
        }
    }

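    // _mm_cvtss_si32 rounds with the current MXCSR rounding mode (round to
    // nearest even by default); NAN and out-of-range inputs produce the
    // "integer indefinite" value 0x8000_0000, i.e. i32::MIN.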
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cvtss_si32() {
        let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1];
        let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520];
        for i in 0..inputs.len() {
            let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0);
            let e = result[i];
            let r = _mm_cvtss_si32(x);
            assert_eq!(
                e, r,
                "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}",
                i, x, r, e
            );
        }
    }

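    // _mm_cvttss_si32 truncates toward zero regardless of the rounding mode;
    // NAN and out-of-range inputs again yield i32::MIN.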
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cvttss_si32() {
        let inputs = &[
            (42.0f32, 42i32),
            (-31.4, -31),
            (-33.5, -33),
            (-34.5, -34),
            (10.999, 10),
            (-5.99, -5),
            (4.0e10, i32::MIN),
            (4.0e-10, 0),
            (NAN, i32::MIN),
            (2147483500.1, 2147483520),
        ];
        for (i, &(xi, e)) in inputs.iter().enumerate() {
            let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0);
            let r = _mm_cvttss_si32(x);
            assert_eq!(
                e, r,
                "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}",
                i, x, r, e
            );
        }
    }

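    // f32 spacing between 2^28 and 2^29 is 32, so 322223333 converts to the
    // nearest representable value 322223328.0; the literal 322223330.0
    // denotes that same f32, which is why the pair below compares equal.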
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cvtsi32_ss() {
        let inputs = &[
            (4555i32, 4555.0f32),
            (322223333, 322223330.0),
            (-432, -432.0),
            (-322223333, -322223330.0),
        ];

        for &(x, f) in inputs.iter() {
            let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
            let r = _mm_cvtsi32_ss(a, x);
            let e = _mm_setr_ps(f, 6.0, 7.0, 8.0);
            assert_eq_m128(e, r);
        }
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cvtss_f32() {
        let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0);
        assert_eq!(_mm_cvtss_f32(a), 312.0134);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_set_ss() {
        let r = _mm_set_ss(black_box(4.25));
        assert_eq_m128(r, _mm_setr_ps(4.25, 0.0, 0.0, 0.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_set1_ps() {
        let r1 = _mm_set1_ps(black_box(4.25));
        let r2 = _mm_set_ps1(black_box(4.25));
        assert_eq!(get_m128(r1, 0), 4.25);
        assert_eq!(get_m128(r1, 1), 4.25);
        assert_eq!(get_m128(r1, 2), 4.25);
        assert_eq!(get_m128(r1, 3), 4.25);
        assert_eq!(get_m128(r2, 0), 4.25);
        assert_eq!(get_m128(r2, 1), 4.25);
        assert_eq!(get_m128(r2, 2), 4.25);
        assert_eq!(get_m128(r2, 3), 4.25);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_set_ps() {
        let r = _mm_set_ps(
            black_box(1.0),
            black_box(2.0),
            black_box(3.0),
            black_box(4.0),
        );
        assert_eq!(get_m128(r, 0), 4.0);
        assert_eq!(get_m128(r, 1), 3.0);
        assert_eq!(get_m128(r, 2), 2.0);
        assert_eq!(get_m128(r, 3), 1.0);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_setr_ps() {
        let r = _mm_setr_ps(
            black_box(1.0),
            black_box(2.0),
            black_box(3.0),
            black_box(4.0),
        );
        assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_setzero_ps() {
        let r = *black_box(&_mm_setzero_ps());
        assert_eq_m128(r, _mm_set1_ps(0.0));
    }

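    // _MM_SHUFFLE(z, y, x, w) packs four 2-bit lane indices into one byte:
    // (z << 6) | (y << 4) | (x << 2) | w.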
    #[simd_test(enable = "sse")]
    unsafe fn test_MM_SHUFFLE() {
        assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11);
        assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00);
        assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01);
    }

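    // For _mm_shuffle_ps the two low mask fields select lanes from `a` and
    // the two high fields select lanes from `b`: 0b00_01_01_11 is
    // _MM_SHUFFLE(0, 1, 1, 3), i.e. [a[3], a[1], b[1], b[0]].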
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_shuffle_ps() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
        let r = _mm_shuffle_ps::<0b00_01_01_11>(a, b);
        assert_eq_m128(r, _mm_setr_ps(4.0, 2.0, 6.0, 5.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_unpackhi_ps() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
        let r = _mm_unpackhi_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(3.0, 7.0, 4.0, 8.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_unpacklo_ps() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
        let r = _mm_unpacklo_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(1.0, 5.0, 2.0, 6.0));
    }

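    // In memory order (as laid out by _mm_setr_ps): movehl_ps(a, b) yields
    // [b[2], b[3], a[2], a[3]] and movelh_ps(a, b) yields
    // [a[0], a[1], b[0], b[1]].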
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_movehl_ps() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
        let r = _mm_movehl_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(7.0, 8.0, 3.0, 4.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_movelh_ps() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
        let r = _mm_movelh_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_load_ss() {
        let a = 42.0f32;
        let r = _mm_load_ss(ptr::addr_of!(a));
        assert_eq_m128(r, _mm_setr_ps(42.0, 0.0, 0.0, 0.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_load1_ps() {
        let a = 42.0f32;
        let r = _mm_load1_ps(ptr::addr_of!(a));
        assert_eq_m128(r, _mm_setr_ps(42.0, 42.0, 42.0, 42.0));
    }

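    // _mm_load_ps requires a 16-byte-aligned pointer, so the aligned-load
    // tests below round `p` up to the next boundary and shift the expected
    // values by the same number of f32 elements.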
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_load_ps() {
        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];

        let mut p = vals.as_ptr();
        let mut fixup = 0.0f32;

        // Make sure p is 16-byte aligned; an unaligned _mm_load_ps would
        // fault (signal 11, SIGSEGV: invalid memory reference).
        let unalignment = (p as usize) & 0xf;
        if unalignment != 0 {
            // Advance by whole f32 elements: padding bytes / 4.
            let delta = (16 - unalignment) >> 2;
            fixup = delta as f32;
            p = p.add(delta);
        }

        let r = _mm_load_ps(p);
        let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup));
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_loadu_ps() {
        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let p = vals.as_ptr().add(3);
        let r = _mm_loadu_ps(black_box(p));
        assert_eq_m128(r, _mm_setr_ps(4.0, 5.0, 6.0, 7.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_loadr_ps() {
        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];

        let mut p = vals.as_ptr();
        let mut fixup = 0.0f32;

        // Make sure p is 16-byte aligned; an unaligned _mm_loadr_ps would
        // fault (signal 11, SIGSEGV: invalid memory reference).
        let unalignment = (p as usize) & 0xf;
        if unalignment != 0 {
            let delta = (16 - unalignment) >> 2;
            fixup = delta as f32;
            p = p.add(delta);
        }

        let r = _mm_loadr_ps(p);
        let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup));
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_store_ss() {
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        _mm_store_ss(vals.as_mut_ptr().add(1), a);

        assert_eq!(vals[0], 0.0);
        assert_eq!(vals[1], 1.0);
        assert_eq!(vals[2], 0.0);
    }

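    // The aligned-store tests below use the same alignment trick and then
    // check the sentinel elements on both sides of the stored block.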
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_store1_ps() {
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Align p to 16-byte boundary
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.add(ofs);
        }

        _mm_store1_ps(p, *black_box(&a));

        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 1.0);
        assert_eq!(vals[ofs + 2], 1.0);
        assert_eq!(vals[ofs + 3], 1.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_store_ps() {
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Align p to 16-byte boundary
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.add(ofs);
        }

        _mm_store_ps(p, *black_box(&a));

        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 2.0);
        assert_eq!(vals[ofs + 2], 3.0);
        assert_eq!(vals[ofs + 3], 4.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_storer_ps() {
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Align p to 16-byte boundary
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.add(ofs);
        }

        _mm_storer_ps(p, *black_box(&a));

        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 4.0);
        assert_eq!(vals[ofs + 1], 3.0);
        assert_eq!(vals[ofs + 2], 2.0);
        assert_eq!(vals[ofs + 3], 1.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_storeu_ps() {
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Make sure p is **not** aligned to 16-byte boundary
        if (p as usize) & 0xf == 0 {
            ofs = 1;
            p = p.add(1);
        }

        _mm_storeu_ps(p, *black_box(&a));

        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 2.0);
        assert_eq!(vals[ofs + 2], 3.0);
        assert_eq!(vals[ofs + 3], 4.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_move_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);

        let r = _mm_move_ss(a, b);
        let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0);
        assert_eq_m128(e, r);
    }

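    // _mm_movemask_ps packs the sign bit of each lane into the low four bits
    // of the result, lane 0 ending up in bit 0.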
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_movemask_ps() {
        let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0));
        assert_eq!(r, 0b0101);

        let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0));
        assert_eq!(r, 0b0111);
    }

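    // _mm_sfence orders all preceding stores (including non-temporal ones)
    // before any store issued after the fence; it returns nothing, so the
    // test can only check that it executes.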
    #[simd_test(enable = "sse")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_sfence() {
        _mm_sfence();
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_MM_TRANSPOSE4_PS() {
        let mut a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let mut b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
        let mut c = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
        let mut d = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);

        _MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d);

        assert_eq_m128(a, _mm_setr_ps(1.0, 5.0, 9.0, 13.0));
        assert_eq_m128(b, _mm_setr_ps(2.0, 6.0, 10.0, 14.0));
        assert_eq_m128(c, _mm_setr_ps(3.0, 7.0, 11.0, 15.0));
        assert_eq_m128(d, _mm_setr_ps(4.0, 8.0, 12.0, 16.0));
    }

    #[repr(align(16))]
    struct Memory {
        pub data: [f32; 4],
    }

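    // _mm_stream_ps is a non-temporal store: it needs a 16-byte-aligned
    // destination (provided by the #[repr(align(16))] Memory wrapper above),
    // followed by _mm_sfence so the store is ordered before later accesses.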
    #[simd_test(enable = "sse")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_ps() {
        let a = _mm_set1_ps(7.0);
        let mut mem = Memory { data: [-1.0; 4] };

        _mm_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
        _mm_sfence();
        for i in 0..4 {
            assert_eq!(mem.data[i], get_m128(a, i));
        }
    }
}