/*
Copyright (c) 2015-2021 Timur Gafarov

Boost Software License - Version 1.0 - August 17th, 2003

Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/

/**
 * SSE-based optimizations for common vector and matrix operations
 *
 * Description:
 * This module implements some frequently used vector and matrix operations
 * using SSE instructions. Implementation is in WIP status.
 * Module is compatible only with Digital Mars D Compiler.
 *
 * Copyright: Timur Gafarov 2015-2021.
 * License: $(LINK2 boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors: Timur Gafarov
 */
module dlib.math.sse;

import dlib.math.vector;
import dlib.math.matrix;

version(GNU)
{
    pragma(msg, "Warning: dlib.math.sse is not compatible with GNU D Compiler");
}

version(DMD)
{
    /**
     * Vector addition.
     *
     * Adds the four packed floats of `b` to the four packed floats of `a`
     * with a single `addps`.
     *
     * Params:
     *   a = left operand; overwritten with the result before being returned
     *   b = right operand
     *
     * Returns: the component-wise sum `a + b`.
     */
    Vector4f sseAdd4(Vector4f a, Vector4f b)
    {
        asm
        {
            // Unaligned loads: no 16-byte alignment is required of a and b.
            movups XMM0, a;
            movups XMM1, b;
            addps XMM0, XMM1;
            movups a, XMM0;
        }

        return a;
    }

    /**
     * Vector subtraction.
     *
     * Subtracts the four packed floats of `b` from those of `a`
     * with a single `subps`.
     *
     * Params:
     *   a = left operand; overwritten with the result before being returned
     *   b = right operand
     *
     * Returns: the component-wise difference `a - b`.
     */
    Vector4f sseSub4(Vector4f a, Vector4f b)
    {
        asm
        {
            movups XMM0, a;
            movups XMM1, b;
            subps XMM0, XMM1;
            movups a, XMM0;
        }

        return a;
    }

    /**
     * Vector multiplication.
     *
     * Multiplies the four packed floats of `a` by those of `b`
     * with a single `mulps`.
     *
     * Params:
     *   a = left operand; overwritten with the result before being returned
     *   b = right operand
     *
     * Returns: the component-wise product `a * b`.
     */
    Vector4f sseMul4(Vector4f a, Vector4f b)
    {
        asm
        {
            movups XMM0, a;
            movups XMM1, b;
            mulps XMM0, XMM1;
            movups a, XMM0;
        }

        return a;
    }

    /**
     * Vector division.
     *
     * Divides the four packed floats of `a` by those of `b`
     * with a single `divps`. No check for zero components of `b`
     * is performed here.
     *
     * Params:
     *   a = dividend; overwritten with the result before being returned
     *   b = divisor
     *
     * Returns: the component-wise quotient `a / b`.
     */
    Vector4f sseDiv4(Vector4f a, Vector4f b)
    {
        asm
        {
            movups XMM0, a;
            movups XMM1, b;
            divps XMM0, XMM1;
            movups a, XMM0;
        }

        return a;
    }

    /**
     * Vector dot product.
     *
     * Multiplies `a` and `b` component-wise and sums all four products.
     *
     * Params:
     *   a = left operand; used as scratch storage for the reduction
     *   b = right operand
     *
     * Returns: `a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]`.
     */
    float sseDot4(Vector4f a, Vector4f b)
    {
        asm
        {
            movups XMM0, a;
            movups XMM1, b;
            // XMM0 = (p0, p1, p2, p3), the component-wise products
            mulps XMM0, XMM1;

            // Horizontal addition
            // Low half of XMM1 = (p2, p3); after the add, the low half of
            // XMM0 is (p0 + p2, p1 + p3). The high halves are don't-care.
            movhlps XMM1, XMM0;
            addps XMM0, XMM1;
            // Broadcast lane 1 (p1 + p3) and add it, so that lane 0 of
            // XMM0 becomes p0 + p2 + p1 + p3.
            movups XMM1, XMM0;
            shufps XMM1, XMM1, 0x55;
            addps XMM0, XMM1;

            movups a, XMM0;
        }

        // Only lane 0 carries the full sum.
        return a[0];
    }

    /**
     * Vector cross product.
     *
     * Computes the 3D cross product of the first three components of
     * `a` and `b`. The fourth lane of the result is `a[3]*b[3] - a[3]*b[3]`.
     *
     * Params:
     *   a = left operand; overwritten with the result before being returned
     *   b = right operand
     *
     * Returns: `(a1*b2 - a2*b1, a2*b0 - a0*b2, a0*b1 - a1*b0, a3*b3 - a3*b3)`.
     */
    Vector4f sseCross3(Vector4f a, Vector4f b)
    {
        asm
        {
            movups XMM0, a;
            movups XMM1, b;
            movaps XMM2, XMM0;
            movaps XMM3, XMM1;

            // 0xC9 selects lanes (1, 2, 0, 3); 0xD2 selects lanes (2, 0, 1, 3).
            shufps XMM0, XMM0, 0xC9; // (a1, a2, a0, a3)
            shufps XMM1, XMM1, 0xD2; // (b2, b0, b1, b3)
            shufps XMM2, XMM2, 0xD2; // (a2, a0, a1, a3)
            shufps XMM3, XMM3, 0xC9; // (b1, b2, b0, b3)

            mulps XMM0, XMM1;
            mulps XMM2, XMM3;

            subps XMM0, XMM2;

            movups a, XMM0;
        }

        return a;
    }

    /**
     * Matrix multiplication.
     *
     * For every group of four consecutive elements of the result
     * (starting at flat index `i = 0, 4, 8, 12`), accumulates
     *
     *   r[i .. i + 4] = sum over j in 0 .. 4 of a[j*4 .. j*4 + 4] * b[i + j]
     *
     * where indices refer to the flat `arrayof` storage of each matrix.
     * NOTE(review): this corresponds to `a * b` if the flat storage is
     * column-major - confirm against dlib.math.matrix.
     *
     * Params:
     *   a = left matrix
     *   b = right matrix
     *
     * Returns: the product matrix.
     */
    Matrix4x4f sseMulMat4(Matrix4x4f a, Matrix4x4f b)
    {
        Matrix4x4f r;
        Vector4f a_line, b_line, r_line;
        float _b;
        uint i, j;

        for (i = 0; i < 16; i += 4)
        {
            // First term (j = 0) initializes the accumulator r_line.
            a_line = *cast(Vector4f*)(a.arrayof.ptr);
            _b = *(b.arrayof.ptr + i);
            asm
            {
                movups XMM0, a_line;

                // Move the scalar into lane 0 of XMM1 and broadcast it.
                mov EAX, _b;
                movd XMM1, EAX;

                shufps XMM1, XMM1, 0;

                mulps XMM0, XMM1;
                movups r_line, XMM0;
            }

            // Remaining terms are added to the accumulator.
            for (j = 1; j < 4; j++)
            {
                a_line = *cast(Vector4f*)(a.arrayof.ptr + j * 4);
                _b = *(b.arrayof.ptr + i + j);
                asm
                {
                    movups XMM0, a_line;

                    mov EAX, _b;
                    movd XMM1, EAX;
                    shufps XMM1, XMM1, 0;

                    mulps XMM0, XMM1;

                    movups XMM2, r_line;
                    addps XMM0, XMM2;

                    movups r_line, XMM0;
                }
            }

            // Store the accumulated group from r_line rather than from XMM0:
            // register contents are not guaranteed to survive between
            // separate asm statements, while r_line always holds the value
            // written by the last asm statement above. This also removes
            // the need for architecture-specific addressing code.
            *cast(Vector4f*)(r.arrayof.ptr + i) = r_line;
        }

        return r;
    }
}