core/stdarch/crates/core_arch/src/x86/
sse3.rs

1//! Streaming SIMD Extensions 3 (SSE3)
2
3use crate::core_arch::{simd::*, x86::*};
4use crate::intrinsics::simd::*;
5
6#[cfg(test)]
7use stdarch_test::assert_instr;
8
9/// Alternatively add and subtract packed single-precision (32-bit)
10/// floating-point elements in `a` to/from packed elements in `b`.
11///
12/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_ps)
13#[inline]
14#[target_feature(enable = "sse3")]
15#[cfg_attr(test, assert_instr(addsubps))]
16#[stable(feature = "simd_x86", since = "1.27.0")]
17pub fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 {
18    unsafe {
19        let a = a.as_f32x4();
20        let b = b.as_f32x4();
21        let add = simd_add(a, b);
22        let sub = simd_sub(a, b);
23        simd_shuffle!(add, sub, [4, 1, 6, 3])
24    }
25}
26
27/// Alternatively add and subtract packed double-precision (64-bit)
28/// floating-point elements in `a` to/from packed elements in `b`.
29///
30/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd)
31#[inline]
32#[target_feature(enable = "sse3")]
33#[cfg_attr(test, assert_instr(addsubpd))]
34#[stable(feature = "simd_x86", since = "1.27.0")]
35pub fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d {
36    unsafe {
37        let a = a.as_f64x2();
38        let b = b.as_f64x2();
39        let add = simd_add(a, b);
40        let sub = simd_sub(a, b);
41        simd_shuffle!(add, sub, [2, 1])
42    }
43}
44
45/// Horizontally adds adjacent pairs of double-precision (64-bit)
46/// floating-point elements in `a` and `b`, and pack the results.
47///
48/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd)
49#[inline]
50#[target_feature(enable = "sse3")]
51#[cfg_attr(test, assert_instr(haddpd))]
52#[stable(feature = "simd_x86", since = "1.27.0")]
53pub fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d {
54    unsafe {
55        let even = simd_shuffle!(a, b, [0, 2]);
56        let odd = simd_shuffle!(a, b, [1, 3]);
57        simd_add(even, odd)
58    }
59}
60
61/// Horizontally adds adjacent pairs of single-precision (32-bit)
62/// floating-point elements in `a` and `b`, and pack the results.
63///
64/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps)
65#[inline]
66#[target_feature(enable = "sse3")]
67#[cfg_attr(test, assert_instr(haddps))]
68#[stable(feature = "simd_x86", since = "1.27.0")]
69pub fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 {
70    unsafe {
71        let even = simd_shuffle!(a, b, [0, 2, 4, 6]);
72        let odd = simd_shuffle!(a, b, [1, 3, 5, 7]);
73        simd_add(even, odd)
74    }
75}
76
77/// Horizontally subtract adjacent pairs of double-precision (64-bit)
78/// floating-point elements in `a` and `b`, and pack the results.
79///
80/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd)
81#[inline]
82#[target_feature(enable = "sse3")]
83#[cfg_attr(test, assert_instr(hsubpd))]
84#[stable(feature = "simd_x86", since = "1.27.0")]
85pub fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d {
86    unsafe {
87        let even = simd_shuffle!(a, b, [0, 2]);
88        let odd = simd_shuffle!(a, b, [1, 3]);
89        simd_sub(even, odd)
90    }
91}
92
93/// Horizontally adds adjacent pairs of single-precision (32-bit)
94/// floating-point elements in `a` and `b`, and pack the results.
95///
96/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps)
97#[inline]
98#[target_feature(enable = "sse3")]
99#[cfg_attr(test, assert_instr(hsubps))]
100#[stable(feature = "simd_x86", since = "1.27.0")]
101pub fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 {
102    unsafe {
103        let even = simd_shuffle!(a, b, [0, 2, 4, 6]);
104        let odd = simd_shuffle!(a, b, [1, 3, 5, 7]);
105        simd_sub(even, odd)
106    }
107}
108
/// Loads 128-bits of integer data from unaligned memory.
/// This intrinsic may perform better than `_mm_loadu_si128`
/// when the data crosses a cache line boundary.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128)
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(lddqu))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i {
    // The LLVM intrinsic takes a byte pointer and yields i8x16; reinterpret
    // the 16 loaded bytes back as an opaque `__m128i`.
    transmute(lddqu(mem_addr as *const _))
}
121
/// Duplicate the low double-precision (64-bit) floating-point element
/// from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd)
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(movddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_movedup_pd(a: __m128d) -> __m128d {
    // Broadcast lane 0 into both lanes: [a0, a0].
    unsafe { simd_shuffle!(a, a, [0, 0]) }
}
133
/// Loads a double-precision (64-bit) floating-point element from memory
/// into both elements of return vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd)
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(movddup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d {
    // Semantically identical to the SSE2 load-and-broadcast intrinsic; the
    // `assert_instr` above checks that codegen still selects MOVDDUP.
    _mm_load1_pd(mem_addr)
}
145
/// Duplicate odd-indexed single-precision (32-bit) floating-point elements
/// from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps)
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(movshdup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_movehdup_ps(a: __m128) -> __m128 {
    // Result lanes: [a1, a1, a3, a3].
    unsafe { simd_shuffle!(a, a, [1, 1, 3, 3]) }
}
157
/// Duplicate even-indexed single-precision (32-bit) floating-point elements
/// from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps)
#[inline]
#[target_feature(enable = "sse3")]
#[cfg_attr(test, assert_instr(movsldup))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_moveldup_ps(a: __m128) -> __m128 {
    // Result lanes: [a0, a0, a2, a2].
    unsafe { simd_shuffle!(a, a, [0, 0, 2, 2]) }
}
169
// Raw LLVM intrinsic backing `_mm_lddqu_si128`; takes a byte pointer and
// returns the 16 loaded bytes as an i8x16 vector.
#[allow(improper_ctypes)]
unsafe extern "C" {
    #[link_name = "llvm.x86.sse3.ldu.dq"]
    fn lddqu(mem_addr: *const i8) -> i8x16;
}
175
#[cfg(test)]
mod tests {
    use stdarch_test::simd_test;

    use crate::core_arch::x86::*;

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_addsub_ps() {
        // Even lanes subtract, odd lanes add:
        // [-1-(-100), 5+20, 0-0, -10+(-5)].
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_addsub_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(99.0, 25.0, 0.0, -15.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_addsub_pd() {
        // Low lane subtracts, high lane adds: [-1-(-100), 5+20].
        let a = _mm_setr_pd(-1.0, 5.0);
        let b = _mm_setr_pd(-100.0, 20.0);
        let r = _mm_addsub_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(99.0, 25.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_hadd_pd() {
        // Pairwise sums: [a0+a1, b0+b1] = [-1+5, -100+20].
        let a = _mm_setr_pd(-1.0, 5.0);
        let b = _mm_setr_pd(-100.0, 20.0);
        let r = _mm_hadd_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(4.0, -80.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_hadd_ps() {
        // Pairwise sums: [a0+a1, a2+a3, b0+b1, b2+b3].
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_hadd_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(4.0, -10.0, -80.0, -5.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_hsub_pd() {
        // Pairwise differences: [a0-a1, b0-b1] = [-1-5, -100-20].
        let a = _mm_setr_pd(-1.0, 5.0);
        let b = _mm_setr_pd(-100.0, 20.0);
        let r = _mm_hsub_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(-6.0, -120.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_hsub_ps() {
        // Pairwise differences: [a0-a1, a2-a3, b0-b1, b2-b3].
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_hsub_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-6.0, 10.0, -120.0, 5.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_lddqu_si128() {
        // An unaligned-tolerant load must reproduce the source bytes exactly.
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4,
            5, 6, 7, 8,
            9, 10, 11, 12,
            13, 14, 15, 16,
        );
        let r = _mm_lddqu_si128(&a);
        assert_eq_m128i(a, r);
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_movedup_pd() {
        // Low lane broadcast to both lanes.
        let a = _mm_setr_pd(-1.0, 5.0);
        let r = _mm_movedup_pd(a);
        assert_eq_m128d(r, _mm_setr_pd(-1.0, -1.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_movehdup_ps() {
        // Odd-indexed lanes duplicated downward: [a1, a1, a3, a3].
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let r = _mm_movehdup_ps(a);
        assert_eq_m128(r, _mm_setr_ps(5.0, 5.0, -10.0, -10.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_moveldup_ps() {
        // Even-indexed lanes duplicated upward: [a0, a0, a2, a2].
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let r = _mm_moveldup_ps(a);
        assert_eq_m128(r, _mm_setr_ps(-1.0, -1.0, 0.0, 0.0));
    }

    #[simd_test(enable = "sse3")]
    unsafe fn test_mm_loaddup_pd() {
        // Scalar load broadcast into both lanes.
        let d = -5.0;
        let r = _mm_loaddup_pd(&d);
        assert_eq_m128d(r, _mm_setr_pd(d, d));
    }
}