From 4ed37ac68ee31641fd9e6fbb919953b0d3ea0c13 Mon Sep 17 00:00:00 2001 From: kaetemi Date: Fri, 13 Jun 2014 19:00:24 +0200 Subject: [PATCH 02/11] SSE2: Add CMake options --HG-- branch : sse2 --- code/CMakeLists.txt | 7 +++++++ code/CMakeModules/nel.cmake | 3 +++ 2 files changed, 10 insertions(+) diff --git a/code/CMakeLists.txt b/code/CMakeLists.txt index 4f0439dfd..3fae8488c 100644 --- a/code/CMakeLists.txt +++ b/code/CMakeLists.txt @@ -131,6 +131,13 @@ IF(FINAL_VERSION) ADD_DEFINITIONS(-DFINAL_VERSION=1) ENDIF(FINAL_VERSION) +IF(WITH_SSE2) + ADD_DEFINITIONS(-DNL_HAS_SSE2) + IF(WITH_SSE3) + ADD_DEFINITIONS(-DNL_HAS_SSE3) + ENDIF(WITH_SSE3) +ENDIF(WITH_SSE2) + IF(WITH_QT) FIND_PACKAGE(Qt4 COMPONENTS QtCore QtGui QtXml QtOpenGL REQUIRED) ENDIF(WITH_QT) diff --git a/code/CMakeModules/nel.cmake b/code/CMakeModules/nel.cmake index b194b5ff9..6e55545d6 100644 --- a/code/CMakeModules/nel.cmake +++ b/code/CMakeModules/nel.cmake @@ -324,6 +324,9 @@ MACRO(NL_SETUP_NEL_DEFAULT_OPTIONS) OPTION(WITH_LIBOVR "With LibOVR support" OFF) OPTION(WITH_LIBVR "With LibVR support" OFF) OPTION(WITH_PERFHUD "With NVIDIA PerfHUD support" OFF) + + OPTION(WITH_SSE2 "With SSE2" ON ) + OPTION(WITH_SSE3 "With SSE3" ON ) ENDMACRO(NL_SETUP_NEL_DEFAULT_OPTIONS) MACRO(NL_SETUP_NELNS_DEFAULT_OPTIONS) From eee9ba0caeed52ba380ba8d9931db57714725455 Mon Sep 17 00:00:00 2001 From: kaetemi Date: Fri, 13 Jun 2014 19:12:31 +0200 Subject: [PATCH 03/11] SSE2: Add aligned allocators --HG-- branch : sse2 --- code/nel/include/nel/misc/types_nl.h | 34 ++++++++++++++++++++++++++++ code/nel/src/misc/common.cpp | 29 ++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/code/nel/include/nel/misc/types_nl.h b/code/nel/include/nel/misc/types_nl.h index 5c3b80475..f4001fd94 100644 --- a/code/nel/include/nel/misc/types_nl.h +++ b/code/nel/include/nel/misc/types_nl.h @@ -328,6 +328,40 @@ typedef unsigned int uint; // at least 32bits (depend of processor) #endif // NL_OS_UNIX + +#ifdef NL_COMP_VC +#define NL_ALIGN(nb) __declspec(align(nb)) +#else +#define NL_ALIGN(nb) __attribute__((aligned(nb))) +#endif + +#ifdef NL_COMP_VC +inline void *aligned_malloc(size_t size, size_t alignment) { return _aligned_malloc(size, alignment); } +inline void aligned_free(void *ptr) { _aligned_free(ptr); } +#else +inline void *aligned_malloc(size_t size, size_t alignment) { return memalign(alignment, size); } +inline void aligned_free(void *ptr) { free(ptr); } +#endif /* NL_COMP_ */ + + +#ifdef NL_HAS_SSE2 + +#define NL_DEFAULT_MEMORY_ALIGNMENT 16 +#define NL_ALIGN_SSE2 NL_ALIGN(NL_DEFAULT_MEMORY_ALIGNMENT) + +extern void *operator new(size_t size) throw(std::bad_alloc); +extern void *operator new[](size_t size) throw(std::bad_alloc); +extern void operator delete(void *p) throw(); +extern void operator delete[](void *p) throw(); + +#else /* NL_HAS_SSE2 */ + +#define NL_DEFAULT_MEMORY_ALIGNMENT 4 +#define NL_ALIGN_SSE2(nb) + +#endif /* NL_HAS_SSE2 */ + + // CHashMap, CHashSet and CHashMultiMap definitions #if defined(_STLPORT_VERSION) // STLport detected # include diff --git a/code/nel/src/misc/common.cpp b/code/nel/src/misc/common.cpp index 36e167260..3c72b6ca4 100644 --- a/code/nel/src/misc/common.cpp +++ b/code/nel/src/misc/common.cpp @@ -71,6 +71,35 @@ extern "C" long _ftol2( double dblSource ) { return _ftol( dblSource ); } #endif // NL_OS_WINDOWS +#ifdef NL_HAS_SSE2 + +void *operator new(size_t size) throw(std::bad_alloc) +{ + void *p = aligned_malloc(size, NL_DEFAULT_MEMORY_ALIGNMENT); + if (p == NULL) throw std::bad_alloc(); + return p; +} + +void *operator new[](size_t size) throw(std::bad_alloc) +{ + void *p = aligned_malloc(size, NL_DEFAULT_MEMORY_ALIGNMENT); + if (p == NULL) throw std::bad_alloc(); + return p; +} + +void operator delete(void *p) throw() +{ + aligned_free(p); +} + +void operator delete[](void *p) throw() +{ + aligned_free(p); +} + +#endif /* NL_HAS_SSE2 */ + + #ifdef DEBUG_NEW #define new DEBUG_NEW #endif From 556a41afeef5bb4d01f81a33e6f264f9be24757f Mon Sep 17 00:00:00 2001 From: kaetemi Date: Fri, 13 Jun 2014 19:26:22 +0200 Subject: [PATCH 04/11] SSE2: Implement alignment for arena allocator --HG-- branch : sse2 --- .../nel/include/nel/misc/fixed_size_allocator.h | 1 + code/nel/src/misc/fixed_size_allocator.cpp | 17 ++++++++++++----- code/nel/src/misc/object_arena_allocator.cpp | 14 ++++++++------ 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/code/nel/include/nel/misc/fixed_size_allocator.h b/code/nel/include/nel/misc/fixed_size_allocator.h index 9eb1d8a10..80b9ed491 100644 --- a/code/nel/include/nel/misc/fixed_size_allocator.h +++ b/code/nel/include/nel/misc/fixed_size_allocator.h @@ -53,6 +53,7 @@ public: uint getNumAllocatedBlocks() const { return _NumAlloc; } private: class CChunk; + NL_ALIGN(NL_DEFAULT_MEMORY_ALIGNMENT) class CNode { public: diff --git a/code/nel/src/misc/fixed_size_allocator.cpp b/code/nel/src/misc/fixed_size_allocator.cpp index 790275ec6..30693ddfd 100644 --- a/code/nel/src/misc/fixed_size_allocator.cpp +++ b/code/nel/src/misc/fixed_size_allocator.cpp @@ -33,6 +33,9 @@ CFixedSizeAllocator::CFixedSizeAllocator(uint numBytesPerBlock, uint numBlockPer _NumChunks = 0; nlassert(numBytesPerBlock > 1); _NumBytesPerBlock = numBytesPerBlock; + const uint mask = NL_DEFAULT_MEMORY_ALIGNMENT - 1; + _NumBytesPerBlock = (_NumBytesPerBlock + mask) & ~mask; + nlassert(_NumBytesPerBlock >= numBytesPerBlock); _NumBlockPerChunk = std::max(numBlockPerChunk, (uint) 3); _NumAlloc = 0; } @@ -67,12 +70,14 @@ void *CFixedSizeAllocator::alloc() return _FreeSpace->unlink(); } +#define aligned_offsetof(s, m) ((offsetof(s, m) + (NL_DEFAULT_MEMORY_ALIGNMENT - 1)) & ~(NL_DEFAULT_MEMORY_ALIGNMENT - 1)) + // ***************************************************************************************************************** void CFixedSizeAllocator::free(void *block) { if (!block) return; /// get the node from the object - CNode *node = (CNode *) ((uint8 *) block - offsetof(CNode, Next)); + CNode *node = (CNode *) ((uint8 *) block - aligned_offsetof(CNode, Next)); // nlassert(node->Chunk != NULL); nlassert(node->Chunk->Allocator == this); @@ -84,7 +89,9 @@ void CFixedSizeAllocator::free(void *block) // ***************************************************************************************************************** uint CFixedSizeAllocator::CChunk::getBlockSizeWithOverhead() const { - return std::max((uint)(sizeof(CNode) - offsetof(CNode, Next)),(uint)(Allocator->getNumBytesPerBlock())) + offsetof(CNode, Next); + nlctassert((sizeof(CNode) % NL_DEFAULT_MEMORY_ALIGNMENT) == 0); + return std::max((uint)(sizeof(CNode) - aligned_offsetof(CNode, Next)), + (uint)(Allocator->getNumBytesPerBlock())) + aligned_offsetof(CNode, Next); } // ***************************************************************************************************************** @@ -105,7 +112,7 @@ CFixedSizeAllocator::CChunk::~CChunk() nlassert(NumFreeObjs == 0); nlassert(Allocator->_NumChunks > 0); -- (Allocator->_NumChunks); - delete[] Mem; + aligned_free(Mem); //delete[] Mem; } // ***************************************************************************************************************** @@ -115,7 +122,7 @@ void CFixedSizeAllocator::CChunk::init(CFixedSizeAllocator *alloc) nlassert(alloc != NULL); Allocator = alloc; // - Mem = new uint8[getBlockSizeWithOverhead() * alloc->getNumBlockPerChunk()]; + Mem = (uint8 *)aligned_malloc(getBlockSizeWithOverhead() * alloc->getNumBlockPerChunk(), NL_DEFAULT_MEMORY_ALIGNMENT); // new uint8[getBlockSizeWithOverhead() * alloc->getNumBlockPerChunk()]; // getNode(0).Chunk = this; getNode(0).Next = &getNode(1); @@ -179,7 +186,7 @@ void *CFixedSizeAllocator::CNode::unlink() *Prev = Next; nlassert(Chunk->NumFreeObjs > 0); Chunk->grab(); // tells the containing chunk that a node has been allocated - return (void *) &Next; + return (void *)((uintptr_t)(this) + aligned_offsetof(CNode, Next)); //(void *) &Next; } // ***************************************************************************************************************** diff --git a/code/nel/src/misc/object_arena_allocator.cpp b/code/nel/src/misc/object_arena_allocator.cpp index 9c73f5059..8084b4ac9 100644 --- a/code/nel/src/misc/object_arena_allocator.cpp +++ b/code/nel/src/misc/object_arena_allocator.cpp @@ -68,21 +68,23 @@ void *CObjectArenaAllocator::alloc(uint size) if (size >= _MaxAllocSize) { // use standard allocator - uint8 *block = new uint8[size + sizeof(uint)]; // an additionnal uint is needed to store size of block + nlctassert(NL_DEFAULT_MEMORY_ALIGNMENT > sizeof(uint)); + uint8 *block = (uint8 *)aligned_malloc(NL_DEFAULT_MEMORY_ALIGNMENT + size, NL_DEFAULT_MEMORY_ALIGNMENT); //new uint8[size + sizeof(uint)]; // an additionnal uint is needed to store size of block if (!block) return NULL; #ifdef NL_DEBUG _MemBlockToAllocID[block] = _AllocID; #endif *(uint *) block = size; - return block + sizeof(uint); + return block + NL_DEFAULT_MEMORY_ALIGNMENT; } uint entry = ((size + (_Granularity - 1)) / _Granularity) ; nlassert(entry < _ObjectSizeToAllocator.size()); if (!_ObjectSizeToAllocator[entry]) { - _ObjectSizeToAllocator[entry] = new CFixedSizeAllocator(entry * _Granularity + sizeof(uint), _MaxAllocSize / size); // an additionnal uint is needed to store size of block + _ObjectSizeToAllocator[entry] = new CFixedSizeAllocator(entry * _Granularity + NL_DEFAULT_MEMORY_ALIGNMENT, _MaxAllocSize / size); // an additionnal uint is needed to store size of block } void *block = _ObjectSizeToAllocator[entry]->alloc(); + nlassert(((uintptr_t)block % NL_DEFAULT_MEMORY_ALIGNMENT) == 0); #ifdef NL_DEBUG if (block) { @@ -91,14 +93,14 @@ void *CObjectArenaAllocator::alloc(uint size) ++_AllocID; #endif *(uint *) block = size; - return (void *) ((uint8 *) block + sizeof(uint)); + return (void *) ((uint8 *) block + NL_DEFAULT_MEMORY_ALIGNMENT); } // ***************************************************************************************************************** void CObjectArenaAllocator::free(void *block) { if (!block) return; - uint8 *realBlock = (uint8 *) block - sizeof(uint); // a uint is used at start of block to give its size + uint8 *realBlock = (uint8 *) block - NL_DEFAULT_MEMORY_ALIGNMENT; // sizeof(uint); // a uint is used at start of block to give its size uint size = *(uint *) realBlock; if (size >= _MaxAllocSize) { @@ -107,7 +109,7 @@ void CObjectArenaAllocator::free(void *block) nlassert(it != _MemBlockToAllocID.end()); _MemBlockToAllocID.erase(it); #endif - delete realBlock; + aligned_free(realBlock); return; } uint entry = ((size + (_Granularity - 1)) / _Granularity); From d3fb390cf321fbe79ba0a4b1afdb988b8931d5aa Mon Sep 17 00:00:00 2001 From: kaetemi Date: Fri, 13 Jun 2014 19:33:09 +0200 Subject: [PATCH 05/11] SSE2: Align CMatrix --HG-- branch : sse2 --- code/nel/include/nel/3d/computed_string.h | 2 +- code/nel/include/nel/misc/matrix.h | 1 + code/nel/src/3d/computed_string.cpp | 4 +++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/code/nel/include/nel/3d/computed_string.h b/code/nel/include/nel/3d/computed_string.h index fcb758da4..517200383 100644 --- a/code/nel/include/nel/3d/computed_string.h +++ b/code/nel/include/nel/3d/computed_string.h @@ -290,7 +290,7 @@ public: * \param matrix transformation matrix * \param hotspot position of string origine */ - void render3D (IDriver& driver,CMatrix matrix,THotSpot hotspot = MiddleMiddle); + void render3D (IDriver& driver, const CMatrix &matrix, THotSpot hotspot = MiddleMiddle); }; diff --git a/code/nel/include/nel/misc/matrix.h b/code/nel/include/nel/misc/matrix.h index 700eb4a14..37a19b655 100644 --- a/code/nel/include/nel/misc/matrix.h +++ b/code/nel/include/nel/misc/matrix.h @@ -53,6 +53,7 @@ class CPlane; * \author Nevrax France * \date 2000 */ +NL_ALIGN_SSE2 class CMatrix { public: diff --git a/code/nel/src/3d/computed_string.cpp b/code/nel/src/3d/computed_string.cpp index a57191cc0..1a9fefba5 100644 --- a/code/nel/src/3d/computed_string.cpp +++ b/code/nel/src/3d/computed_string.cpp @@ -143,11 +143,13 @@ void CComputedString::render2D (IDriver& driver, /*------------------------------------------------------------------*\ render3D() \*------------------------------------------------------------------*/ -void CComputedString::render3D (IDriver& driver,CMatrix matrix,THotSpot hotspot) +void CComputedString::render3D (IDriver& driver, const CMatrix &matrixp, THotSpot hotspot) { if (Vertices.getNumVertices() == 0) return; + CMatrix matrix = matrixp; + // get window size uint32 wndWidth, wndHeight; driver.getWindowSize(wndWidth, wndHeight); From f6ad306a57374b79c9a1f056d0a724ebb85809e4 Mon Sep 17 00:00:00 2001 From: kaetemi Date: Fri, 13 Jun 2014 20:25:45 +0200 Subject: [PATCH 06/11] SSE2: Add macro for force inline --HG-- branch : sse2 --- code/nel/include/nel/misc/types_nl.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/code/nel/include/nel/misc/types_nl.h b/code/nel/include/nel/misc/types_nl.h index f4001fd94..a9c6c6cfb 100644 --- a/code/nel/include/nel/misc/types_nl.h +++ b/code/nel/include/nel/misc/types_nl.h @@ -329,6 +329,19 @@ typedef unsigned int uint; // at least 32bits (depend of processor) #endif // NL_OS_UNIX +// #ifdef NL_ENABLE_FORCE_INLINE +# ifdef NL_COMP_VC +# define NL_FORCE_INLINE __forceinline +# elif NL_COMP_GCC +# define NL_FORCE_INLINE inline __attribute__((always_inline)) +# else +# define NL_FORCE_INLINE inline +# endif +// #else +// # define NL_FORCE_INLINE inline +// #endif + + #ifdef NL_COMP_VC #define NL_ALIGN(nb) __declspec(align(nb)) #else From 848932f93aa8316a38782f1f477c5520672d0b7b Mon Sep 17 00:00:00 2001 From: kaetemi Date: Thu, 19 Jun 2014 00:05:43 +0200 Subject: [PATCH 07/11] SSE2: Fix for MinGW --HG-- branch : sse2 --- code/nel/include/nel/misc/fixed_size_allocator.h | 4 ++-- code/nel/include/nel/misc/matrix.h | 4 ++-- code/nel/include/nel/misc/types_nl.h | 7 +++++-- code/nel/src/misc/matrix.cpp | 9 +++++++++ 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/code/nel/include/nel/misc/fixed_size_allocator.h b/code/nel/include/nel/misc/fixed_size_allocator.h index 80b9ed491..079c54bc0 100644 --- a/code/nel/include/nel/misc/fixed_size_allocator.h +++ b/code/nel/include/nel/misc/fixed_size_allocator.h @@ -53,8 +53,8 @@ public: uint getNumAllocatedBlocks() const { return _NumAlloc; } private: class CChunk; - NL_ALIGN(NL_DEFAULT_MEMORY_ALIGNMENT) - class CNode + + class NL_ALIGN(NL_DEFAULT_MEMORY_ALIGNMENT) CNode { public: CChunk *Chunk; // the Chunk this node belongs to. diff --git a/code/nel/include/nel/misc/matrix.h b/code/nel/include/nel/misc/matrix.h index 37a19b655..649d53324 100644 --- a/code/nel/include/nel/misc/matrix.h +++ b/code/nel/include/nel/misc/matrix.h @@ -53,8 +53,8 @@ class CPlane; * \author Nevrax France * \date 2000 */ -NL_ALIGN_SSE2 -class CMatrix + +class NL_ALIGN_SSE2 CMatrix { public: /// Rotation Order. diff --git a/code/nel/include/nel/misc/types_nl.h b/code/nel/include/nel/misc/types_nl.h index 8eaa77930..6f41deff1 100644 --- a/code/nel/include/nel/misc/types_nl.h +++ b/code/nel/include/nel/misc/types_nl.h @@ -342,7 +342,7 @@ typedef unsigned int uint; // at least 32bits (depend of processor) // #ifdef NL_ENABLE_FORCE_INLINE # ifdef NL_COMP_VC # define NL_FORCE_INLINE __forceinline -# elif NL_COMP_GCC +# elif defined(NL_COMP_GCC) # define NL_FORCE_INLINE inline __attribute__((always_inline)) # else # define NL_FORCE_INLINE inline @@ -358,7 +358,10 @@ typedef unsigned int uint; // at least 32bits (depend of processor) #define NL_ALIGN(nb) __attribute__((aligned(nb))) #endif -#ifdef NL_COMP_VC +#ifdef NL_OS_WINDOWS +#include +#include +#include inline void *aligned_malloc(size_t size, size_t alignment) { return _aligned_malloc(size, alignment); } inline void aligned_free(void *ptr) { _aligned_free(ptr); } #else diff --git a/code/nel/src/misc/matrix.cpp b/code/nel/src/misc/matrix.cpp index dd884f4d5..acfe7ee96 100644 --- a/code/nel/src/misc/matrix.cpp +++ b/code/nel/src/misc/matrix.cpp @@ -140,6 +140,11 @@ inline void CMatrix::testExpandRot() const self->Scale33= 1; } } +void CMatrix::testExpandRotEx() const +{ + testExpandRot(); +} + inline void CMatrix::testExpandProj() const { if(hasProj()) @@ -151,6 +156,10 @@ inline void CMatrix::testExpandProj() const self->a41=0; self->a42=0; self->a43=0; self->a44=1; } } +void CMatrix::testExpandProjEx() const +{ + testExpandProj(); +} // ====================================================================================================== From 71a598db7eb77c3e3cca2bc23c03bbd4335ad26f Mon Sep 17 00:00:00 2001 From: kaetemi Date: Fri, 13 Jun 2014 22:21:07 +0200 Subject: [PATCH 08/11] SSE2: Remove dead code --HG-- branch : sse2 --- code/nel/include/nel/3d/matrix_3x4.h | 275 --------------------------- code/nel/src/3d/mesh_mrm_skin.cpp | 118 ------------ code/nel/src/3d/mesh_mrm_skinned.cpp | 117 ------------ 3 files changed, 510 deletions(-) diff --git a/code/nel/include/nel/3d/matrix_3x4.h b/code/nel/include/nel/3d/matrix_3x4.h index d7ed660fc..45901c20e 100644 --- a/code/nel/include/nel/3d/matrix_3x4.h +++ b/code/nel/include/nel/3d/matrix_3x4.h @@ -108,281 +108,6 @@ public: }; -// *************************************************************************** -// *************************************************************************** -// SSE Matrix -// *************************************************************************** -// *************************************************************************** - - -// *************************************************************************** -#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM) - - -/** For fast vector/point multiplication. Special usage for Skinning. - * NB: SSE is no more used (no speed gain, some memory problem), but keep it for possible future usage. - */ -class CMatrix3x4SSE -{ -public: - // Order them in memory column first, for SSE column multiplication. - float a11, a21, a31, a41; - float a12, a22, a32, a42; - float a13, a23, a33, a43; - float a14, a24, a34, a44; - - // Copy from a matrix. - void set(const CMatrix &mat) - { - const float *m =mat.get(); - a11= m[0]; a12= m[4]; a13= m[8] ; a14= m[12]; - a21= m[1]; a22= m[5]; a23= m[9] ; a24= m[13]; - a31= m[2]; a32= m[6]; a33= m[10]; a34= m[14]; - // not used. - a41= 0 ; a42= 0 ; a43= 0 ; a44= 1; - } - - - // mulSetvector. NB: in should be different as v!! (else don't work). - void mulSetVector(const CVector &vin, CVector &vout) - { - __asm - { - mov eax, vin - mov ebx, this - mov edi, vout - // Load in vector in op[0] - movss xmm0, [eax]vin.x - movss xmm1, [eax]vin.y - movss xmm2, [eax]vin.z - // Expand op[0] to op[1], op[2], op[3] - shufps xmm0, xmm0, 0 - shufps xmm1, xmm1, 0 - shufps xmm2, xmm2, 0 - // Mul each vector with 3 Matrix column - mulps xmm0, [ebx]this.a11 - mulps xmm1, [ebx]this.a12 - mulps xmm2, [ebx]this.a13 - // Add each column vector. - addps xmm0, xmm1 - addps xmm0, xmm2 - - // write the result. - movss [edi]vout.x, xmm0 - shufps xmm0, xmm0, 33 - movss [edi]vout.y, xmm0 - movhlps xmm0, xmm0 - movss [edi]vout.z, xmm0 - } - } - // mulSetpoint. NB: in should be different as v!! (else don't work). - void mulSetPoint(const CVector &vin, CVector &vout) - { - __asm - { - mov eax, vin - mov ebx, this - mov edi, vout - // Load in vector in op[0] - movss xmm0, [eax]vin.x - movss xmm1, [eax]vin.y - movss xmm2, [eax]vin.z - // Expand op[0] to op[1], op[2], op[3] - shufps xmm0, xmm0, 0 - shufps xmm1, xmm1, 0 - shufps xmm2, xmm2, 0 - // Mul each vector with 3 Matrix column - mulps xmm0, [ebx]this.a11 - mulps xmm1, [ebx]this.a12 - mulps xmm2, [ebx]this.a13 - // Add each column vector. - addps xmm0, xmm1 - addps xmm0, xmm2 - // Add Matrix translate column vector - addps xmm0, [ebx]this.a14 - - // write the result. - movss [edi]vout.x, xmm0 - shufps xmm0, xmm0, 33 - movss [edi]vout.y, xmm0 - movhlps xmm0, xmm0 - movss [edi]vout.z, xmm0 - } - } - - - // mulSetvector. NB: vin should be different as v!! (else don't work). - void mulSetVector(const CVector &vin, float scale, CVector &vout) - { - __asm - { - mov eax, vin - mov ebx, this - mov edi, vout - // Load in vector in op[0] - movss xmm0, [eax]vin.x - movss xmm1, [eax]vin.y - movss xmm2, [eax]vin.z - // Load scale in op[0] - movss xmm3, scale - // Expand op[0] to op[1], op[2], op[3] - shufps xmm0, xmm0, 0 - shufps xmm1, xmm1, 0 - shufps xmm2, xmm2, 0 - shufps xmm3, xmm3, 0 - // Store vertex column in other regs. - movaps xmm5, xmm0 - movaps xmm6, xmm1 - movaps xmm7, xmm2 - // Mul each vector with 3 Matrix column - mulps xmm0, [ebx]this.a11 - mulps xmm1, [ebx]this.a12 - mulps xmm2, [ebx]this.a13 - // Add each column vector. - addps xmm0, xmm1 - addps xmm0, xmm2 - - // mul final result with scale - mulps xmm0, xmm3 - - // store it in xmm4 for future use. - movaps xmm4, xmm0 - } - } - // mulSetpoint. NB: vin should be different as v!! (else don't work). - void mulSetPoint(const CVector &vin, float scale, CVector &vout) - { - __asm - { - mov eax, vin - mov ebx, this - mov edi, vout - // Load in vector in op[0] - movss xmm0, [eax]vin.x - movss xmm1, [eax]vin.y - movss xmm2, [eax]vin.z - // Load scale in op[0] - movss xmm3, scale - // Expand op[0] to op[1], op[2], op[3] - shufps xmm0, xmm0, 0 - shufps xmm1, xmm1, 0 - shufps xmm2, xmm2, 0 - shufps xmm3, xmm3, 0 - // Store vertex column in other regs. - movaps xmm5, xmm0 - movaps xmm6, xmm1 - movaps xmm7, xmm2 - // Mul each vector with 3 Matrix column - mulps xmm0, [ebx]this.a11 - mulps xmm1, [ebx]this.a12 - mulps xmm2, [ebx]this.a13 - // Add each column vector. - addps xmm0, xmm1 - addps xmm0, xmm2 - // Add Matrix translate column vector - addps xmm0, [ebx]this.a14 - - // mul final result with scale - mulps xmm0, xmm3 - - // store it in xmm4 for future use. - movaps xmm4, xmm0 - } - } - - - // mulAddvector. NB: vin should be different as v!! (else don't work). - void mulAddVector(const CVector &/* vin */, float scale, CVector &vout) - { - __asm - { - mov ebx, this - mov edi, vout - // Load vin vector loaded in mulSetVector - movaps xmm0, xmm5 - movaps xmm1, xmm6 - movaps xmm2, xmm7 - // Load scale in op[0] - movss xmm3, scale - // Expand op[0] to op[1], op[2], op[3] - shufps xmm3, xmm3, 0 - // Mul each vector with 3 Matrix column - mulps xmm0, [ebx]this.a11 - mulps xmm1, [ebx]this.a12 - mulps xmm2, [ebx]this.a13 - // Add each column vector. - addps xmm0, xmm1 - addps xmm0, xmm2 - - // mul final result with scale - mulps xmm0, xmm3 - - // Add result, with prec sum. - addps xmm0, xmm4 - - // store it in xmm4 for future use. - movaps xmm4, xmm0 - - // write the result. - movss [edi]vout.x, xmm0 - shufps xmm0, xmm0, 33 - movss [edi]vout.y, xmm0 - movhlps xmm0, xmm0 - movss [edi]vout.z, xmm0 - } - } - // mulAddpoint. NB: vin should be different as v!! (else don't work). - void mulAddPoint(const CVector &/* vin */, float scale, CVector &vout) - { - __asm - { - mov ebx, this - mov edi, vout - // Load vin vector loaded in mulSetPoint - movaps xmm0, xmm5 - movaps xmm1, xmm6 - movaps xmm2, xmm7 - // Load scale in op[0] - movss xmm3, scale - // Expand op[0] to op[1], op[2], op[3] - shufps xmm3, xmm3, 0 - // Mul each vector with 3 Matrix column - mulps xmm0, [ebx]this.a11 - mulps xmm1, [ebx]this.a12 - mulps xmm2, [ebx]this.a13 - // Add each column vector. - addps xmm0, xmm1 - addps xmm0, xmm2 - // Add Matrix translate column vector - addps xmm0, [ebx]this.a14 - - // mul final result with scale - mulps xmm0, xmm3 - - // Add result, with prec sum. - addps xmm0, xmm4 - - // store it in xmm4 for future use. - movaps xmm4, xmm0 - - // write the result. - movss [edi]vout.x, xmm0 - shufps xmm0, xmm0, 33 - movss [edi]vout.y, xmm0 - movhlps xmm0, xmm0 - movss [edi]vout.z, xmm0 - } - } - -}; - -#else // NL_OS_WINDOWS -/// dummy CMatrix3x4SSE for non windows platform -class CMatrix3x4SSE : public CMatrix3x4 { }; -#endif - - - } // NL3D diff --git a/code/nel/src/3d/mesh_mrm_skin.cpp b/code/nel/src/3d/mesh_mrm_skin.cpp index 13e8bdd21..a34eeb766 100644 --- a/code/nel/src/3d/mesh_mrm_skin.cpp +++ b/code/nel/src/3d/mesh_mrm_skin.cpp @@ -39,124 +39,6 @@ namespace NL3D { -// *************************************************************************** -// *************************************************************************** -// CMatrix3x4SSE array correctly aligned -// *************************************************************************** -// *************************************************************************** - - - -// *************************************************************************** -#define NL3D_SSE_ALIGNEMENT 16 -/** - * A CMatrix3x4SSE array correctly aligned - * NB: SSE is no more used (no speed gain, some memory problem), but keep it for possible future usage. - */ -class CMatrix3x4SSEArray -{ -private: - void *_AllocData; - void *_Data; - uint _Size; - uint _Capacity; - -public: - CMatrix3x4SSEArray() - { - _AllocData= NULL; - _Data= NULL; - _Size= 0; - _Capacity= 0; - } - ~CMatrix3x4SSEArray() - { - clear(); - } - CMatrix3x4SSEArray(const CMatrix3x4SSEArray &other) - { - _AllocData= NULL; - _Data= NULL; - _Size= 0; - _Capacity= 0; - *this= other; - } - CMatrix3x4SSEArray &operator=(const CMatrix3x4SSEArray &other) - { - if( this == &other) - return *this; - resize(other.size()); - // copy data from aligned pointers to aligned pointers. - memcpy(_Data, other._Data, size() * sizeof(CMatrix3x4SSE) ); - - return *this; - } - - - CMatrix3x4SSE *getPtr() - { - return (CMatrix3x4SSE*)_Data; - } - - void clear() - { - delete [] ((uint8 *)_AllocData); - _AllocData= NULL; - _Data= NULL; - _Size= 0; - _Capacity= 0; - } - - void resize(uint n) - { - // reserve ?? - if(n>_Capacity) - reserve( max(2*_Capacity, n)); - _Size= n; - } - - void reserve(uint n) - { - if(n==0) - clear(); - else if(n>_Capacity) - { - // Alloc new data. - void *newAllocData; - void *newData; - - // Alloc for alignement. - newAllocData= new uint8 [n * sizeof(CMatrix3x4SSE) + NL3D_SSE_ALIGNEMENT-1]; - if(newAllocData==NULL) - throw Exception("SSE Allocation Failed"); - - // Align ptr - newData= (void*) ( ((ptrdiff_t)newAllocData+NL3D_SSE_ALIGNEMENT-1) & (~(NL3D_SSE_ALIGNEMENT-1)) ); - - // copy valid data from old to new. - memcpy(newData, _Data, size() * sizeof(CMatrix3x4SSE) ); - - // release old. - if(_AllocData) - delete [] ((uint8*)_AllocData); - - // change ptrs and capacity. - _Data= newData; - _AllocData= newAllocData; - _Capacity= n; - - // TestYoyo - //nlwarning("YOYO Tst SSE P4: %X, %d", _Data, n); - } - } - - uint size() const {return _Size;} - - - CMatrix3x4SSE &operator[](uint i) {return ((CMatrix3x4SSE*)_Data)[i];} -}; - - // *************************************************************************** // *************************************************************************** diff --git a/code/nel/src/3d/mesh_mrm_skinned.cpp b/code/nel/src/3d/mesh_mrm_skinned.cpp index 2b1c3beb6..6af29bf73 100644 --- a/code/nel/src/3d/mesh_mrm_skinned.cpp +++ b/code/nel/src/3d/mesh_mrm_skinned.cpp @@ -2247,123 +2247,6 @@ void CMeshMRMSkinnedGeom::getSkinWeights (std::vector &skinW } } -// *************************************************************************** -// *************************************************************************** -// CMatrix3x4SSE array correctly aligned -// *************************************************************************** -// *************************************************************************** - - - -// *************************************************************************** -#define NL3D_SSE_ALIGNEMENT 16 -/** - * A CMatrix3x4SSEArray array correctly aligned - * NB: SSE is no more used (no speed gain, some memory problem), but keep it for possible future usage. - */ -class CMatrix3x4SSEArray -{ -private: - void *_AllocData; - void *_Data; - uint _Size; - uint _Capacity; - -public: - CMatrix3x4SSEArray() - { - _AllocData= NULL; - _Data= NULL; - _Size= 0; - _Capacity= 0; - } - ~CMatrix3x4SSEArray() - { - clear(); - } - CMatrix3x4SSEArray(const CMatrix3x4SSEArray &other) - { - _AllocData= NULL; - _Data= NULL; - _Size= 0; - _Capacity= 0; - *this= other; - } - CMatrix3x4SSEArray &operator=(const CMatrix3x4SSEArray &other) - { - if( this == &other) - return *this; - resize(other.size()); - // copy data from aligned pointers to aligned pointers. - memcpy(_Data, other._Data, size() * sizeof(CMatrix3x4SSE) ); - - return *this; - } - - - CMatrix3x4SSE *getPtr() - { - return (CMatrix3x4SSE*)_Data; - } - - void clear() - { - delete [] ((uint8 *) _AllocData); - _AllocData= NULL; - _Data= NULL; - _Size= 0; - _Capacity= 0; - } - - void resize(uint n) - { - // reserve ?? - if(n>_Capacity) - reserve( max(2*_Capacity, n)); - _Size= n; - } - - void reserve(uint n) - { - if(n==0) - clear(); - else if(n>_Capacity) - { - // Alloc new data. - void *newAllocData; - void *newData; - - // Alloc for alignement. - newAllocData= new uint8 [n * sizeof(CMatrix3x4SSE) + NL3D_SSE_ALIGNEMENT-1]; - if(newAllocData==NULL) - throw Exception("SSE Allocation Failed"); - - // Align ptr - newData= (void*) ( ((ptrdiff_t)newAllocData+NL3D_SSE_ALIGNEMENT-1) & (~(NL3D_SSE_ALIGNEMENT-1)) ); - - // copy valid data from old to new. - memcpy(newData, _Data, size() * sizeof(CMatrix3x4SSE) ); - - // release old. - if(_AllocData) - delete [] ((uint8*)_AllocData); - - // change ptrs and capacity. - _Data= newData; - _AllocData= newAllocData; - _Capacity= n; - - // TestYoyo - //nlwarning("YOYO Tst SSE P4: %X, %d", _Data, n); - } - } - - uint size() const {return _Size;} - - - CMatrix3x4SSE &operator[](uint i) {return ((CMatrix3x4SSE*)_Data)[i];} -}; - // *************************************************************************** // *************************************************************************** From 6734852fa350ee096c249d0a88d5ac3df1b7a84f Mon Sep 17 00:00:00 2001 From: kaetemi Date: Sat, 14 Jun 2014 00:38:35 +0200 Subject: [PATCH 09/11] SSE2: Ensure correct allocator is used --HG-- branch : sse2 --- code/nel/include/nel/misc/object_vector.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/code/nel/include/nel/misc/object_vector.h b/code/nel/include/nel/misc/object_vector.h index a544a62b5..ea9be81e2 100644 --- a/code/nel/include/nel/misc/object_vector.h +++ b/code/nel/include/nel/misc/object_vector.h @@ -29,6 +29,12 @@ # endif // NLMISC_HEAP_ALLOCATION_NDEBUG #endif // NL_USE_DEFAULT_MEMORY_MANAGER +#ifndef NL_OV_USE_NEW_ALLOCATOR +# ifdef NL_HAS_SSE2 +# define NL_OV_USE_NEW_ALLOCATOR +# endif // NL_HAS_SSE2 +#endif // NL_OV_USE_NEW_ALLOCATOR + namespace NLMISC { From cec2970169fabb98b9f73bfd756ebf256f940c69 Mon Sep 17 00:00:00 2001 From: kaetemi Date: Sat, 14 Jun 2014 01:09:05 +0200 Subject: [PATCH 10/11] SSE2: Replace prefetch --HG-- branch : sse2 --- code/nel/src/3d/mesh_mrm_skin_template.cpp | 38 ++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/code/nel/src/3d/mesh_mrm_skin_template.cpp b/code/nel/src/3d/mesh_mrm_skin_template.cpp index 1958cae90..4a9e11106 100644 --- a/code/nel/src/3d/mesh_mrm_skin_template.cpp +++ b/code/nel/src/3d/mesh_mrm_skin_template.cpp @@ -39,7 +39,23 @@ static void applyArraySkinNormalT(uint numMatrixes, uint32 *infPtr, CMesh::CSkin { /* Prefetch all vertex/normal before, it is to be faster. */ -#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM) +#ifdef NL_HAS_SSE2 + { + uint nInfTmp= nInf; + uint32 *infTmpPtr= infPtr; + for(;nInfTmp>0;nInfTmp--, infTmpPtr++) + { + uint index= *infTmpPtr; + CMesh::CSkinWeight *srcSkin= srcSkinPtr + index; + CVector *srcVertex= srcVertexPtr + index; + CVector *srcNormal= srcNormalPtr + index; + + _mm_prefetch((const char *)(void *)srcSkin, _MM_HINT_T1); + _mm_prefetch((const char *)(void *)srcVertex, _MM_HINT_T1); + _mm_prefetch((const char *)(void *)srcNormal, _MM_HINT_T1); + } + } +#elif defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM) { uint nInfTmp= nInf; uint32 *infTmpPtr= infPtr; @@ -176,7 +192,25 @@ static void applyArraySkinTangentSpaceT(uint numMatrixes, uint32 *infPtr, CMesh: { /* Prefetch all vertex/normal/tgSpace before, it is faster. */ -#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM) +#ifdef NL_HAS_SSE2 + { + uint nInfTmp= nInf; + uint32 *infTmpPtr= infPtr; + for(;nInfTmp>0;nInfTmp--, infTmpPtr++) + { + uint index= *infTmpPtr; + CMesh::CSkinWeight *srcSkin= srcSkinPtr + index; + CVector *srcVertex= srcVertexPtr + index; + CVector *srcNormal= srcNormalPtr + index; + CVector *srcTgSpace= tgSpacePtr + index; + + _mm_prefetch((const char *)(void *)srcSkin, _MM_HINT_T1); + _mm_prefetch((const char *)(void *)srcVertex, _MM_HINT_T1); + _mm_prefetch((const char *)(void *)srcNormal, _MM_HINT_T1); + _mm_prefetch((const char *)(void *)srcTgSpace, _MM_HINT_T1); + } + } +#elif defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM) { uint nInfTmp= nInf; uint32 *infTmpPtr= infPtr; From 43a061bebd2e2fb8cf3702332fd1775c3a89ad16 Mon Sep 17 00:00:00 2001 From: kaetemi Date: Fri, 20 Jun 2014 03:08:51 +0200 Subject: [PATCH 11/11] SSE2: Compile fix --HG-- branch : sse2 --- code/nel/include/nel/misc/types_nl.h | 2 +- code/nel/src/misc/matrix.cpp | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/code/nel/include/nel/misc/types_nl.h b/code/nel/include/nel/misc/types_nl.h index 6f41deff1..d0111fb08 100644 --- a/code/nel/include/nel/misc/types_nl.h +++ b/code/nel/include/nel/misc/types_nl.h @@ -383,7 +383,7 @@ extern void operator delete[](void *p) throw(); #else /* NL_HAS_SSE2 */ #define NL_DEFAULT_MEMORY_ALIGNMENT 4 -#define NL_ALIGN_SSE2(nb) +#define NL_ALIGN_SSE2 #endif /* NL_HAS_SSE2 */ diff --git a/code/nel/src/misc/matrix.cpp b/code/nel/src/misc/matrix.cpp index acfe7ee96..e907fb2e0 100644 --- a/code/nel/src/misc/matrix.cpp +++ b/code/nel/src/misc/matrix.cpp @@ -140,10 +140,6 @@ inline void CMatrix::testExpandRot() const self->Scale33= 1; } } -void CMatrix::testExpandRotEx() const -{ - testExpandRot(); -} inline void CMatrix::testExpandProj() const { @@ -156,10 +152,6 @@ inline void CMatrix::testExpandProj() const self->a41=0; self->a42=0; self->a43=0; self->a44=1; } } -void CMatrix::testExpandProjEx() const -{ - testExpandProj(); -} // ======================================================================================================