/*
Copyright (c) 2015-2021 Timur Gafarov

Boost Software License - Version 1.0 - August 17th, 2003

Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/

/**
 * SSE-based optimizations for common vector and matrix operations
 *
 * Description:
 * This module implements some frequently used vector and matrix operations
 * using SSE instructions. The implementation is a work in progress.
 * This module is compatible only with the Digital Mars D compiler.
 *
 * Copyright: Timur Gafarov 2015-2021.
 * License: $(LINK2 boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors: Timur Gafarov
 */
module dlib.math.sse;

import dlib.math.vector;
import dlib.math.matrix;

version(GNU)
{
    pragma(msg, "Warning: dlib.math.sse is not compatible with the GNU D Compiler");
}

version(DMD)
{
    /// Vector addition
    Vector4f sseAdd4(Vector4f a, Vector4f b)
    {
        asm
        {
            // Load both operands unaligned, add packed singles,
            // and write the result back into a
            movups XMM0, a;
            movups XMM1, b;
            addps XMM0, XMM1;
            movups a, XMM0;
        }

        return a;
    }

    /// Vector subtraction
    Vector4f sseSub4(Vector4f a, Vector4f b)
    {
        asm
        {
            movups XMM0, a;
            movups XMM1, b;
            subps XMM0, XMM1;
            movups a, XMM0;
        }

        return a;
    }

    /// Vector multiplication
    Vector4f sseMul4(Vector4f a, Vector4f b)
    {
        asm
        {
            movups XMM0, a;
            movups XMM1, b;
            mulps XMM0, XMM1;
            movups a, XMM0;
        }

        return a;
    }

    /// Vector division
    Vector4f sseDiv4(Vector4f a, Vector4f b)
    {
        asm
        {
            movups XMM0, a;
            movups XMM1, b;
            divps XMM0, XMM1;
            movups a, XMM0;
        }

        return a;
    }
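
    // A minimal usage sketch for the componentwise operations above
    // (not part of the original module). All values below are exactly
    // representable in single precision, so element-wise equality is safe,
    // assuming Vector4f's element-wise opEquals from dlib.math.vector.
    unittest
    {
        Vector4f a = Vector4f(1.0f, 2.0f, 3.0f, 4.0f);
        Vector4f b = Vector4f(2.0f, 6.0f, 12.0f, 20.0f);
        assert(sseAdd4(a, b) == Vector4f(3.0f, 8.0f, 15.0f, 24.0f));
        assert(sseSub4(b, a) == Vector4f(1.0f, 4.0f, 9.0f, 16.0f));
        assert(sseMul4(a, b) == Vector4f(2.0f, 12.0f, 36.0f, 80.0f));
        assert(sseDiv4(b, a) == Vector4f(2.0f, 3.0f, 4.0f, 5.0f));
    }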

    /// Vector dot product
    float sseDot4(Vector4f a, Vector4f b)
    {
        asm
        {
            movups XMM0, a;
            movups XMM1, b;
            mulps XMM0, XMM1;

            // Horizontal addition: fold the upper pair of products into
            // the lower pair, then fold lane 1 into lane 0, leaving the
            // full sum in the lowest lane
            movhlps XMM1, XMM0;
            addps XMM0, XMM1;
            movups XMM1, XMM0;
            shufps XMM1, XMM1, 0x55;
            addps XMM0, XMM1;

            movups a, XMM0;
        }

        return a[0];
    }
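
    // A minimal check sketch for sseDot4 (not part of the original module),
    // using exactly representable values: 1*4 + 2*3 + 3*2 + 4*1 = 20
    unittest
    {
        Vector4f a = Vector4f(1.0f, 2.0f, 3.0f, 4.0f);
        Vector4f b = Vector4f(4.0f, 3.0f, 2.0f, 1.0f);
        assert(sseDot4(a, b) == 20.0f);
    }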

    /// Vector cross product
    Vector4f sseCross3(Vector4f a, Vector4f b)
    {
        asm
        {
            movups XMM0, a;
            movups XMM1, b;
            movaps XMM2, XMM0;
            movaps XMM3, XMM1;

            // Shuffle masks 0xC9 and 0xD2 rotate the xyz lanes so that
            // cross(a, b) = a.yzx * b.zxy - a.zxy * b.yzx;
            // the w lane cancels to zero
            shufps XMM0, XMM0, 0xC9;
            shufps XMM1, XMM1, 0xD2;
            shufps XMM2, XMM2, 0xD2;
            shufps XMM3, XMM3, 0xC9;

            mulps XMM0, XMM1;
            mulps XMM2, XMM3;

            subps XMM0, XMM2;

            movups a, XMM0;
        }

        return a;
    }
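
    // A minimal check sketch for sseCross3 (not part of the original module):
    // the cross product of the x and y basis vectors is the z basis vector,
    // and the w lane comes out as zero
    unittest
    {
        Vector4f x = Vector4f(1.0f, 0.0f, 0.0f, 0.0f);
        Vector4f y = Vector4f(0.0f, 1.0f, 0.0f, 0.0f);
        assert(sseCross3(x, y) == Vector4f(0.0f, 0.0f, 1.0f, 0.0f));
    }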

    /// Matrix multiplication
    Matrix4x4f sseMulMat4(Matrix4x4f a, Matrix4x4f b)
    {
        Matrix4x4f r;
        Vector4f a_line, r_line;
        float _b;
        uint i, j;
        Vector4f* _rp;
        for (i = 0; i < 16; i += 4)
        {
            // Start the i-th result row as row 0 of a scaled by b[i],
            // broadcasting the scalar coefficient across all four lanes
            a_line = *cast(Vector4f*)(a.arrayof.ptr);
            _b = *(b.arrayof.ptr + i);
            asm
            {
                movups XMM0, a_line;

                mov EAX, _b;
                movd XMM1, EAX;

                shufps XMM1, XMM1, 0;

                mulps XMM0, XMM1;
                movups r_line, XMM0;
            }

            // Accumulate the remaining rows of a, each scaled by the
            // corresponding coefficient from b
            for (j = 1; j < 4; j++)
            {
                a_line = *cast(Vector4f*)(a.arrayof.ptr + j * 4);
                _b = *(b.arrayof.ptr + i + j);
                asm
                {
                    movups XMM0, a_line;

                    mov EAX, _b;
                    movd XMM1, EAX;
                    shufps XMM1, XMM1, 0;

                    mulps XMM0, XMM1;

                    movups XMM2, r_line;
                    addps XMM0, XMM2;

                    movups r_line, XMM0;
                }
            }

            // Store the finished row (still in XMM0) into the result matrix
            _rp = cast(Vector4f*)(r.arrayof.ptr + i);
            version(X86) asm
            {
                mov EAX, _rp;
                movups [EAX], XMM0;
            }
            version(X86_64) asm
            {
                mov RAX, _rp;
                movups [RAX], XMM0;
            }
        }

        return r;
    }
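
    // A minimal sanity sketch for sseMulMat4 (not part of the original
    // module). Multiplying by the identity must return the other operand
    // unchanged whatever the storage convention; Matrix4x4f.identity is
    // assumed to be available from dlib.math.matrix.
    unittest
    {
        Matrix4x4f m = Matrix4x4f.identity;
        m.arrayof[3] = 5.0f; // make m distinguishable from the identity
        Matrix4x4f r = sseMulMat4(m, Matrix4x4f.identity);
        assert(r.arrayof == m.arrayof);
    }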
}