Merge with sse2 (align all memory allocations, align CMatrix)

2024-12-30 04:40:53 +00:00 · 2014-06-20 07:19:04 +02:00 · 2014-06-20 07:19:04 +02:00 · ab7faa256e
commit ab7faa256e
parent 93e97c56cd 43a061bebd
16 changed files with 160 additions and 527 deletions
--- a/code/CMakeLists.txt
+++ b/code/CMakeLists.txt
@ -131,6 +131,13 @@ IF(FINAL_VERSION)
  ADD_DEFINITIONS(-DFINAL_VERSION=1)
 ENDIF(FINAL_VERSION)

+IF(WITH_SSE2) # WITH_SSE2 / WITH_SSE3 are the options declared in CMakeModules/nel.cmake
+  ADD_DEFINITIONS(-DNL_HAS_SSE2) # NOTE(review): defines the macro only; confirm the SSE2 compiler flags are set elsewhere
+  IF(WITH_SSE3) # NL_HAS_SSE3 is only ever defined together with NL_HAS_SSE2
+    ADD_DEFINITIONS(-DNL_HAS_SSE3)
+  ENDIF(WITH_SSE3)
+ENDIF(WITH_SSE2)
+
 IF(WITH_QT)
  FIND_PACKAGE(Qt4 COMPONENTS QtCore QtGui QtXml QtOpenGL REQUIRED)
 ENDIF(WITH_QT)
--- a/code/CMakeModules/nel.cmake
+++ b/code/CMakeModules/nel.cmake
@ -324,6 +324,9 @@ MACRO(NL_SETUP_NEL_DEFAULT_OPTIONS)
  OPTION(WITH_LIBOVR              "With LibOVR support"                           OFF)
  OPTION(WITH_LIBVR               "With LibVR support"                            OFF)
  OPTION(WITH_PERFHUD             "With NVIDIA PerfHUD support"                   OFF)
+  
+  OPTION(WITH_SSE2                "With SSE2"                                     ON )
+  OPTION(WITH_SSE3                "With SSE3"                                     ON )
 ENDMACRO(NL_SETUP_NEL_DEFAULT_OPTIONS)

 MACRO(NL_SETUP_NELNS_DEFAULT_OPTIONS)
--- a/code/nel/include/nel/3d/computed_string.h
+++ b/code/nel/include/nel/3d/computed_string.h
@ -290,7 +290,7 @@ public:
 	 * \param matrix transformation matrix
 	 * \param hotspot position of string origine
 	 */
-	void render3D (IDriver& driver,CMatrix matrix,THotSpot hotspot = MiddleMiddle);
+	void render3D (IDriver& driver, const CMatrix &matrix, THotSpot hotspot = MiddleMiddle);

 };

--- a/code/nel/include/nel/3d/matrix_3x4.h
+++ b/code/nel/include/nel/3d/matrix_3x4.h
@ -108,281 +108,6 @@ public:
 };


-// ***************************************************************************
-// ***************************************************************************
-// SSE Matrix
-// ***************************************************************************
-// ***************************************************************************
-
-
-// ***************************************************************************
-#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)
-
-
-/** For fast vector/point multiplication. Special usage for Skinning.
- *	NB: SSE is no more used (no speed gain, some memory problem), but keep it for possible future usage.
- */
-class	CMatrix3x4SSE
-{
-public:
-	// Order them in memory column first, for SSE column multiplication.
-	float	a11, a21, a31, a41;
-	float	a12, a22, a32, a42;
-	float	a13, a23, a33, a43;
-	float	a14, a24, a34, a44;
-
-	// Copy from a matrix.
-	void	set(const CMatrix &mat)
-	{
-		const float	*m =mat.get();
-		a11= m[0]; a12= m[4]; a13= m[8] ; a14= m[12];
-		a21= m[1]; a22= m[5]; a23= m[9] ; a24= m[13];
-		a31= m[2]; a32= m[6]; a33= m[10]; a34= m[14];
-		// not used.
-		a41= 0   ; a42= 0   ; a43= 0    ; a44= 1;
-	}
-
-
-	// mulSetvector. NB: in should be different as v!! (else don't work).
-	void	mulSetVector(const CVector &vin, CVector &vout)
-	{
-		__asm
-		{
-			mov		eax, vin
-			mov		ebx, this
-			mov		edi, vout
-			// Load in vector in op[0]
-			movss	xmm0, [eax]vin.x
-			movss	xmm1, [eax]vin.y
-			movss	xmm2, [eax]vin.z
-			// Expand op[0] to op[1], op[2], op[3]
-			shufps	xmm0, xmm0, 0
-			shufps	xmm1, xmm1, 0
-			shufps	xmm2, xmm2, 0
-			// Mul each vector with 3 Matrix column
-			mulps	xmm0, [ebx]this.a11
-			mulps	xmm1, [ebx]this.a12
-			mulps	xmm2, [ebx]this.a13
-			// Add each column vector.
-			addps	xmm0, xmm1
-			addps	xmm0, xmm2
-
-			// write the result.
-			movss	[edi]vout.x, xmm0
-			shufps	xmm0, xmm0, 33
-			movss	[edi]vout.y, xmm0
-			movhlps	xmm0, xmm0
-			movss	[edi]vout.z, xmm0
-		}
-	}
-	// mulSetpoint. NB: in should be different as v!! (else don't work).
-	void	mulSetPoint(const CVector &vin, CVector &vout)
-	{
-		__asm
-		{
-			mov		eax, vin
-			mov		ebx, this
-			mov		edi, vout
-			// Load in vector in op[0]
-			movss	xmm0, [eax]vin.x
-			movss	xmm1, [eax]vin.y
-			movss	xmm2, [eax]vin.z
-			// Expand op[0] to op[1], op[2], op[3]
-			shufps	xmm0, xmm0, 0
-			shufps	xmm1, xmm1, 0
-			shufps	xmm2, xmm2, 0
-			// Mul each vector with 3 Matrix column
-			mulps	xmm0, [ebx]this.a11
-			mulps	xmm1, [ebx]this.a12
-			mulps	xmm2, [ebx]this.a13
-			// Add each column vector.
-			addps	xmm0, xmm1
-			addps	xmm0, xmm2
-			// Add Matrix translate column vector
-			addps	xmm0, [ebx]this.a14
-
-			// write the result.
-			movss	[edi]vout.x, xmm0
-			shufps	xmm0, xmm0, 33
-			movss	[edi]vout.y, xmm0
-			movhlps	xmm0, xmm0
-			movss	[edi]vout.z, xmm0
-		}
-	}
-
-
-	// mulSetvector. NB: vin should be different as v!! (else don't work).
-	void	mulSetVector(const CVector &vin, float scale, CVector &vout)
-	{
-		__asm
-		{
-			mov		eax, vin
-			mov		ebx, this
-			mov		edi, vout
-			// Load in vector in op[0]
-			movss	xmm0, [eax]vin.x
-			movss	xmm1, [eax]vin.y
-			movss	xmm2, [eax]vin.z
-			// Load scale in op[0]
-			movss	xmm3, scale
-			// Expand op[0] to op[1], op[2], op[3]
-			shufps	xmm0, xmm0, 0
-			shufps	xmm1, xmm1, 0
-			shufps	xmm2, xmm2, 0
-			shufps	xmm3, xmm3, 0
-			// Store vertex column in other regs.
-			movaps	xmm5, xmm0
-			movaps	xmm6, xmm1
-			movaps	xmm7, xmm2
-			// Mul each vector with 3 Matrix column
-			mulps	xmm0, [ebx]this.a11
-			mulps	xmm1, [ebx]this.a12
-			mulps	xmm2, [ebx]this.a13
-			// Add each column vector.
-			addps	xmm0, xmm1
-			addps	xmm0, xmm2
-
-			// mul final result with scale
-			mulps	xmm0, xmm3
-
-			// store it in xmm4 for future use.
-			movaps	xmm4, xmm0
-		}
-	}
-	// mulSetpoint. NB: vin should be different as v!! (else don't work).
-	void	mulSetPoint(const CVector &vin, float scale, CVector &vout)
-	{
-		__asm
-		{
-			mov		eax, vin
-			mov		ebx, this
-			mov		edi, vout
-			// Load in vector in op[0]
-			movss	xmm0, [eax]vin.x
-			movss	xmm1, [eax]vin.y
-			movss	xmm2, [eax]vin.z
-			// Load scale in op[0]
-			movss	xmm3, scale
-			// Expand op[0] to op[1], op[2], op[3]
-			shufps	xmm0, xmm0, 0
-			shufps	xmm1, xmm1, 0
-			shufps	xmm2, xmm2, 0
-			shufps	xmm3, xmm3, 0
-			// Store vertex column in other regs.
-			movaps	xmm5, xmm0
-			movaps	xmm6, xmm1
-			movaps	xmm7, xmm2
-			// Mul each vector with 3 Matrix column
-			mulps	xmm0, [ebx]this.a11
-			mulps	xmm1, [ebx]this.a12
-			mulps	xmm2, [ebx]this.a13
-			// Add each column vector.
-			addps	xmm0, xmm1
-			addps	xmm0, xmm2
-			// Add Matrix translate column vector
-			addps	xmm0, [ebx]this.a14
-
-			// mul final result with scale
-			mulps	xmm0, xmm3
-
-			// store it in xmm4 for future use.
-			movaps	xmm4, xmm0
-		}
-	}
-
-
-	// mulAddvector. NB: vin should be different as v!! (else don't work).
-	void	mulAddVector(const CVector &/* vin */, float scale, CVector &vout)
-	{
-		__asm
-		{
-			mov		ebx, this
-			mov		edi, vout
-			// Load vin vector loaded in mulSetVector
-			movaps	xmm0, xmm5
-			movaps	xmm1, xmm6
-			movaps	xmm2, xmm7
-			// Load scale in op[0]
-			movss	xmm3, scale
-			// Expand op[0] to op[1], op[2], op[3]
-			shufps	xmm3, xmm3, 0
-			// Mul each vector with 3 Matrix column
-			mulps	xmm0, [ebx]this.a11
-			mulps	xmm1, [ebx]this.a12
-			mulps	xmm2, [ebx]this.a13
-			// Add each column vector.
-			addps	xmm0, xmm1
-			addps	xmm0, xmm2
-
-			// mul final result with scale
-			mulps	xmm0, xmm3
-
-			// Add result, with prec sum.
-			addps	xmm0, xmm4
-
-			// store it in xmm4 for future use.
-			movaps	xmm4, xmm0
-
-			// write the result.
-			movss	[edi]vout.x, xmm0
-			shufps	xmm0, xmm0, 33
-			movss	[edi]vout.y, xmm0
-			movhlps	xmm0, xmm0
-			movss	[edi]vout.z, xmm0
-		}
-	}
-	// mulAddpoint. NB: vin should be different as v!! (else don't work).
-	void	mulAddPoint(const CVector &/* vin */, float scale, CVector &vout)
-	{
-		__asm
-		{
-			mov		ebx, this
-			mov		edi, vout
-			// Load vin vector loaded in mulSetPoint
-			movaps	xmm0, xmm5
-			movaps	xmm1, xmm6
-			movaps	xmm2, xmm7
-			// Load scale in op[0]
-			movss	xmm3, scale
-			// Expand op[0] to op[1], op[2], op[3]
-			shufps	xmm3, xmm3, 0
-			// Mul each vector with 3 Matrix column
-			mulps	xmm0, [ebx]this.a11
-			mulps	xmm1, [ebx]this.a12
-			mulps	xmm2, [ebx]this.a13
-			// Add each column vector.
-			addps	xmm0, xmm1
-			addps	xmm0, xmm2
-			// Add Matrix translate column vector
-			addps	xmm0, [ebx]this.a14
-
-			// mul final result with scale
-			mulps	xmm0, xmm3
-
-			// Add result, with prec sum.
-			addps	xmm0, xmm4
-
-			// store it in xmm4 for future use.
-			movaps	xmm4, xmm0
-
-			// write the result.
-			movss	[edi]vout.x, xmm0
-			shufps	xmm0, xmm0, 33
-			movss	[edi]vout.y, xmm0
-			movhlps	xmm0, xmm0
-			movss	[edi]vout.z, xmm0
-		}
-	}
-
-};
-
-#else // NL_OS_WINDOWS
-/// dummy CMatrix3x4SSE for non windows platform
-class CMatrix3x4SSE : public  CMatrix3x4 { };
-#endif
-
-
-
 } // NL3D


--- a/code/nel/include/nel/misc/fixed_size_allocator.h
+++ b/code/nel/include/nel/misc/fixed_size_allocator.h
@ -53,7 +53,8 @@ public:
 	uint getNumAllocatedBlocks() const { return _NumAlloc; }
 private:
 	class CChunk;
-	class CNode
+	
+	class NL_ALIGN(NL_DEFAULT_MEMORY_ALIGNMENT) CNode
 	{
 	public:
 		CChunk *Chunk; // the Chunk this node belongs to.
--- a/code/nel/include/nel/misc/matrix.h
+++ b/code/nel/include/nel/misc/matrix.h
@ -53,7 +53,8 @@ class	CPlane;
 * \author Nevrax France
 * \date 2000
 */
-class CMatrix
+
+class NL_ALIGN_SSE2 CMatrix
 {
 public:
 	/// Rotation Order.
--- a/code/nel/include/nel/misc/object_vector.h
+++ b/code/nel/include/nel/misc/object_vector.h
@ -29,6 +29,12 @@
 # endif // NLMISC_HEAP_ALLOCATION_NDEBUG
 #endif // NL_USE_DEFAULT_MEMORY_MANAGER

+#ifndef NL_OV_USE_NEW_ALLOCATOR
+# ifdef NL_HAS_SSE2
+#  define NL_OV_USE_NEW_ALLOCATOR
+# endif // NL_HAS_SSE2
+#endif // NL_OV_USE_NEW_ALLOCATOR
+
 namespace NLMISC {


--- a/code/nel/include/nel/misc/types_nl.h
+++ b/code/nel/include/nel/misc/types_nl.h
@ -338,6 +338,56 @@ typedef	unsigned	int			uint;			// at least 32bits (depend of processor)

 #endif // NL_OS_UNIX

+
+// #ifdef NL_ENABLE_FORCE_INLINE
+#	ifdef NL_COMP_VC
+#		define NL_FORCE_INLINE __forceinline
+#	elif defined(NL_COMP_GCC)
+#		define NL_FORCE_INLINE inline __attribute__((always_inline))
+#	else
+#		define NL_FORCE_INLINE inline
+#	endif
+// #else
+// #	define NL_FORCE_INLINE inline
+// #endif
+
+
+#ifdef NL_COMP_VC
+#define NL_ALIGN(nb) __declspec(align(nb))
+#else
+#define NL_ALIGN(nb) __attribute__((aligned(nb)))
+#endif
+
+#include <stdlib.h> /* needed by both branches: free() and posix_memalign() */
+#ifdef NL_OS_WINDOWS
+#include <intrin.h>
+#include <malloc.h>
+inline void *aligned_malloc(size_t size, size_t alignment) { return _aligned_malloc(size, alignment); } /* NULL on failure; release with aligned_free */
+inline void aligned_free(void *ptr) { _aligned_free(ptr); }
+#else /* posix_memalign requires a power-of-two alignment >= sizeof(void *); small alignments are raised to that minimum */
+inline void *aligned_malloc(size_t size, size_t alignment) { void *ptr = NULL; if (alignment < sizeof(void *)) alignment = sizeof(void *); return posix_memalign(&ptr, alignment, size) == 0 ? ptr : NULL; }
+inline void aligned_free(void *ptr) { free(ptr); }
+#endif /* NL_OS_WINDOWS */
+
+
+#ifdef NL_HAS_SSE2
+#include <new> /* std::bad_alloc, named by the exception specifications below */
+#define NL_DEFAULT_MEMORY_ALIGNMENT 16
+#define NL_ALIGN_SSE2 NL_ALIGN(NL_DEFAULT_MEMORY_ALIGNMENT)
+/* Global allocation functions; their aligned_malloc based definitions are in src/misc/common.cpp. */
+extern void *operator new(size_t size) throw(std::bad_alloc);
+extern void *operator new[](size_t size) throw(std::bad_alloc);
+extern void operator delete(void *p) throw();
+extern void operator delete[](void *p) throw();
+
+#else /* NL_HAS_SSE2 */
+/* Without SSE2 the default alignment is 4 bytes and NL_ALIGN_SSE2 expands to nothing. */
+#define NL_DEFAULT_MEMORY_ALIGNMENT 4
+#define NL_ALIGN_SSE2
+
+#endif /* NL_HAS_SSE2 */
+
+
 // CHashMap, CHashSet and CHashMultiMap definitions
 #if defined(_STLPORT_VERSION) // STLport detected
 #	include <hash_map>
--- a/code/nel/src/3d/computed_string.cpp
+++ b/code/nel/src/3d/computed_string.cpp
@ -143,11 +143,13 @@ void CComputedString::render2D (IDriver& driver,
 /*------------------------------------------------------------------*\
 							render3D()
 \*------------------------------------------------------------------*/
-void CComputedString::render3D (IDriver& driver,CMatrix matrix,THotSpot hotspot)
+void CComputedString::render3D (IDriver& driver, const CMatrix &matrixp, THotSpot hotspot)
 {
 	if (Vertices.getNumVertices() == 0)
 		return;

+	CMatrix matrix = matrixp;
+
 	// get window size
 	uint32	wndWidth, wndHeight;
 	driver.getWindowSize(wndWidth, wndHeight);
--- a/code/nel/src/3d/mesh_mrm_skin.cpp
+++ b/code/nel/src/3d/mesh_mrm_skin.cpp
@ -39,124 +39,6 @@ namespace NL3D
 {


-// ***************************************************************************
-// ***************************************************************************
-// CMatrix3x4SSE array correctly aligned
-// ***************************************************************************
-// ***************************************************************************
-
-
-
-// ***************************************************************************
-#define	NL3D_SSE_ALIGNEMENT		16
-/**
- *	A CMatrix3x4SSE array correctly aligned
- *	NB: SSE is no more used (no speed gain, some memory problem), but keep it for possible future usage.
- */
-class	CMatrix3x4SSEArray
-{
-private:
-	void	*_AllocData;
-	void	*_Data;
-	uint	_Size;
-	uint	_Capacity;
-
-public:
-	CMatrix3x4SSEArray()
-	{
-		_AllocData= NULL;
-		_Data= NULL;
-		_Size= 0;
-		_Capacity= 0;
-	}
-	~CMatrix3x4SSEArray()
-	{
-		clear();
-	}
-	CMatrix3x4SSEArray(const CMatrix3x4SSEArray &other)
-	{
-		_AllocData= NULL;
-		_Data= NULL;
-		_Size= 0;
-		_Capacity= 0;
-		*this= other;
-	}
-	CMatrix3x4SSEArray &operator=(const CMatrix3x4SSEArray &other)
-	{
-		if( this == &other)
-			return *this;
-		resize(other.size());
-		// copy data from aligned pointers to aligned pointers.
-		memcpy(_Data, other._Data, size() * sizeof(CMatrix3x4SSE) );
-
-		return *this;
-	}
-
-
-	CMatrix3x4SSE	*getPtr()
-	{
-		return (CMatrix3x4SSE*)_Data;
-	}
-
-	void	clear()
-	{
-		delete [] ((uint8 *)_AllocData);
-		_AllocData= NULL;
-		_Data= NULL;
-		_Size= 0;
-		_Capacity= 0;
-	}
-
-	void	resize(uint n)
-	{
-		// reserve ??
-		if(n>_Capacity)
-			reserve( max(2*_Capacity, n));
-		_Size= n;
-	}
-
-	void	reserve(uint n)
-	{
-		if(n==0)
-			clear();
-		else if(n>_Capacity)
-		{
-			// Alloc new data.
-			void	*newAllocData;
-			void	*newData;
-
-			// Alloc for alignement.
-			newAllocData= new uint8 [n * sizeof(CMatrix3x4SSE) + NL3D_SSE_ALIGNEMENT-1];
-			if(newAllocData==NULL)
-				throw Exception("SSE Allocation Failed");
-
-			// Align ptr
-			newData= (void*) ( ((ptrdiff_t)newAllocData+NL3D_SSE_ALIGNEMENT-1) & (~(NL3D_SSE_ALIGNEMENT-1)) );
-
-			// copy valid data from old to new.
-			memcpy(newData, _Data, size() * sizeof(CMatrix3x4SSE) );
-
-			// release old.
-			if(_AllocData)
-				delete [] ((uint8*)_AllocData);
-
-			// change ptrs and capacity.
-			_Data= newData;
-			_AllocData= newAllocData;
-			_Capacity= n;
-
-			// TestYoyo
-			//nlwarning("YOYO Tst SSE P4: %X, %d", _Data, n);
-		}
-	}
-
-	uint	size() const {return _Size;}
-
-
-	CMatrix3x4SSE	&operator[](uint i) {return ((CMatrix3x4SSE*)_Data)[i];}
-};
-
-

 // ***************************************************************************
 // ***************************************************************************
--- a/code/nel/src/3d/mesh_mrm_skin_template.cpp
+++ b/code/nel/src/3d/mesh_mrm_skin_template.cpp
@ -39,7 +39,23 @@ static void	applyArraySkinNormalT(uint numMatrixes, uint32 *infPtr, CMesh::CSkin
 {
 	/* Prefetch all vertex/normal before, it is to be faster.
 	*/
-#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)
+#ifdef NL_HAS_SSE2
+	{
+		uint	nInfTmp= nInf;
+		uint32	*infTmpPtr= infPtr;
+		for(;nInfTmp>0;nInfTmp--, infTmpPtr++)
+		{
+			uint	index= *infTmpPtr;
+			CMesh::CSkinWeight	*srcSkin= srcSkinPtr + index;
+			CVector				*srcVertex= srcVertexPtr + index;
+			CVector				*srcNormal= srcNormalPtr + index;
+
+			_mm_prefetch((const char *)(void *)srcSkin, _MM_HINT_T1);
+			_mm_prefetch((const char *)(void *)srcVertex, _MM_HINT_T1);
+			_mm_prefetch((const char *)(void *)srcNormal, _MM_HINT_T1);
+		}
+	}
+#elif defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)
 	{
 		uint	nInfTmp= nInf;
 		uint32	*infTmpPtr= infPtr;
@ -176,7 +192,25 @@ static void	applyArraySkinTangentSpaceT(uint numMatrixes, uint32 *infPtr, CMesh:
 {
 	/* Prefetch all vertex/normal/tgSpace before, it is faster.
 	*/
-#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)
+#ifdef NL_HAS_SSE2
+	{
+		uint	nInfTmp= nInf;
+		uint32	*infTmpPtr= infPtr;
+		for(;nInfTmp>0;nInfTmp--, infTmpPtr++)
+		{
+			uint	index= *infTmpPtr;
+			CMesh::CSkinWeight	*srcSkin= srcSkinPtr + index;
+			CVector				*srcVertex= srcVertexPtr + index;
+			CVector				*srcNormal= srcNormalPtr + index;
+			CVector				*srcTgSpace= tgSpacePtr + index;
+
+			_mm_prefetch((const char *)(void *)srcSkin, _MM_HINT_T1);
+			_mm_prefetch((const char *)(void *)srcVertex, _MM_HINT_T1);
+			_mm_prefetch((const char *)(void *)srcNormal, _MM_HINT_T1);
+			_mm_prefetch((const char *)(void *)srcTgSpace, _MM_HINT_T1);
+		}
+	}
+#elif defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)
 	{
 		uint	nInfTmp= nInf;
 		uint32	*infTmpPtr= infPtr;
--- a/code/nel/src/3d/mesh_mrm_skinned.cpp
+++ b/code/nel/src/3d/mesh_mrm_skinned.cpp
@ -2247,123 +2247,6 @@ void CMeshMRMSkinnedGeom::getSkinWeights (std::vector<CMesh::CSkinWeight> &skinW
 	}
 }

-// ***************************************************************************
-// ***************************************************************************
-// CMatrix3x4SSE array correctly aligned
-// ***************************************************************************
-// ***************************************************************************
-
-
-
-// ***************************************************************************
-#define	NL3D_SSE_ALIGNEMENT		16
-/**
- *	A CMatrix3x4SSEArray array correctly aligned
- *	NB: SSE is no more used (no speed gain, some memory problem), but keep it for possible future usage.
- */
-class	CMatrix3x4SSEArray
-{
-private:
-	void	*_AllocData;
-	void	*_Data;
-	uint	_Size;
-	uint	_Capacity;
-
-public:
-	CMatrix3x4SSEArray()
-	{
-		_AllocData= NULL;
-		_Data= NULL;
-		_Size= 0;
-		_Capacity= 0;
-	}
-	~CMatrix3x4SSEArray()
-	{
-		clear();
-	}
-	CMatrix3x4SSEArray(const CMatrix3x4SSEArray &other)
-	{
-		_AllocData= NULL;
-		_Data= NULL;
-		_Size= 0;
-		_Capacity= 0;
-		*this= other;
-	}
-	CMatrix3x4SSEArray &operator=(const CMatrix3x4SSEArray &other)
-	{
-		if( this == &other)
-			return *this;
-		resize(other.size());
-		// copy data from aligned pointers to aligned pointers.
-		memcpy(_Data, other._Data, size() * sizeof(CMatrix3x4SSE) );
-
-		return *this;
-	}
-
-
-	CMatrix3x4SSE	*getPtr()
-	{
-		return (CMatrix3x4SSE*)_Data;
-	}
-
-	void	clear()
-	{
-		delete [] ((uint8 *) _AllocData);
-		_AllocData= NULL;
-		_Data= NULL;
-		_Size= 0;
-		_Capacity= 0;
-	}
-
-	void	resize(uint n)
-	{
-		// reserve ??
-		if(n>_Capacity)
-			reserve( max(2*_Capacity, n));
-		_Size= n;
-	}
-
-	void	reserve(uint n)
-	{
-		if(n==0)
-			clear();
-		else if(n>_Capacity)
-		{
-			// Alloc new data.
-			void	*newAllocData;
-			void	*newData;
-
-			// Alloc for alignement.
-			newAllocData= new uint8 [n * sizeof(CMatrix3x4SSE) + NL3D_SSE_ALIGNEMENT-1];
-			if(newAllocData==NULL)
-				throw Exception("SSE Allocation Failed");
-
-			// Align ptr
-			newData= (void*) ( ((ptrdiff_t)newAllocData+NL3D_SSE_ALIGNEMENT-1) & (~(NL3D_SSE_ALIGNEMENT-1)) );
-
-			// copy valid data from old to new.
-			memcpy(newData, _Data, size() * sizeof(CMatrix3x4SSE) );
-
-			// release old.
-			if(_AllocData)
-				delete [] ((uint8*)_AllocData);
-
-			// change ptrs and capacity.
-			_Data= newData;
-			_AllocData= newAllocData;
-			_Capacity= n;
-
-			// TestYoyo
-			//nlwarning("YOYO Tst SSE P4: %X, %d", _Data, n);
-		}
-	}
-
-	uint	size() const {return _Size;}
-
-
-	CMatrix3x4SSE	&operator[](uint i) {return ((CMatrix3x4SSE*)_Data)[i];}
-};
-

 // ***************************************************************************
 // ***************************************************************************
--- a/code/nel/src/misc/common.cpp
+++ b/code/nel/src/misc/common.cpp
@ -75,6 +75,35 @@ extern "C" long _ftol2( double dblSource ) { return _ftol( dblSource ); }
 #endif // !NL_COMP_MINGW


+#ifdef NL_HAS_SSE2
+// Global operator new/delete replacements: every block is aligned on NL_DEFAULT_MEMORY_ALIGNMENT bytes.
+void *operator new(size_t size) throw(std::bad_alloc)
+{
+	void *p = aligned_malloc(size != 0 ? size : 1, NL_DEFAULT_MEMORY_ALIGNMENT); // a zero-size request must still yield a non-null pointer
+	if (p == NULL) throw std::bad_alloc();
+	return p;
+}
+// Array form, same policy as the scalar form.
+void *operator new[](size_t size) throw(std::bad_alloc)
+{
+	void *p = aligned_malloc(size != 0 ? size : 1, NL_DEFAULT_MEMORY_ALIGNMENT); // a zero-size request must still yield a non-null pointer
+	if (p == NULL) throw std::bad_alloc();
+	return p;
+}
+// Blocks come from aligned_malloc, so they are released with aligned_free.
+void operator delete(void *p) throw()
+{
+	aligned_free(p);
+}
+// NOTE(review): the std::nothrow overloads are not replaced here; confirm they forward to these functions on every target runtime.
+void operator delete[](void *p) throw()
+{
+	aligned_free(p);
+}
+
+#endif /* NL_HAS_SSE2 */
+
+
 #ifdef DEBUG_NEW
 	#define new DEBUG_NEW
 #endif
--- a/code/nel/src/misc/fixed_size_allocator.cpp
+++ b/code/nel/src/misc/fixed_size_allocator.cpp
@ -33,6 +33,9 @@ CFixedSizeAllocator::CFixedSizeAllocator(uint numBytesPerBlock, uint numBlockPer
 	_NumChunks = 0;
 	nlassert(numBytesPerBlock > 1);
 	_NumBytesPerBlock = numBytesPerBlock;
+	const uint mask = NL_DEFAULT_MEMORY_ALIGNMENT - 1;
+	_NumBytesPerBlock = (_NumBytesPerBlock + mask) & ~mask;
+	nlassert(_NumBytesPerBlock >= numBytesPerBlock);
 	_NumBlockPerChunk = std::max(numBlockPerChunk, (uint) 3);
 	_NumAlloc = 0;
 }
@ -67,12 +70,14 @@ void *CFixedSizeAllocator::alloc()
 	return _FreeSpace->unlink();
 }

+#define aligned_offsetof(s, m) ((offsetof(s, m) + (NL_DEFAULT_MEMORY_ALIGNMENT - 1)) & ~(NL_DEFAULT_MEMORY_ALIGNMENT - 1))
+
 // *****************************************************************************************************************
 void CFixedSizeAllocator::free(void *block)
 {
 	if (!block) return;
 	/// get the node from the object
-	CNode *node = (CNode *) ((uint8 *) block - offsetof(CNode, Next));
+	CNode *node = (CNode *) ((uint8 *) block - aligned_offsetof(CNode, Next));
 	//
 	nlassert(node->Chunk != NULL);
 	nlassert(node->Chunk->Allocator == this);
@ -84,7 +89,9 @@ void CFixedSizeAllocator::free(void *block)
 // *****************************************************************************************************************
 uint CFixedSizeAllocator::CChunk::getBlockSizeWithOverhead() const
 {
-	return std::max((uint)(sizeof(CNode) - offsetof(CNode, Next)),(uint)(Allocator->getNumBytesPerBlock())) + offsetof(CNode, Next);
+	nlctassert((sizeof(CNode) % NL_DEFAULT_MEMORY_ALIGNMENT) == 0);
+	return std::max((uint)(sizeof(CNode) - aligned_offsetof(CNode, Next)),
+		(uint)(Allocator->getNumBytesPerBlock())) + aligned_offsetof(CNode, Next);
 }

 // *****************************************************************************************************************
@ -105,7 +112,7 @@ CFixedSizeAllocator::CChunk::~CChunk()
 	nlassert(NumFreeObjs == 0);
 	nlassert(Allocator->_NumChunks > 0);
 	-- (Allocator->_NumChunks);
-	delete[] Mem;
+	aligned_free(Mem); //delete[] Mem;
 }

 // *****************************************************************************************************************
@ -115,7 +122,7 @@ void CFixedSizeAllocator::CChunk::init(CFixedSizeAllocator *alloc)
 	nlassert(alloc != NULL);
 	Allocator = alloc;
 	//
-	Mem = new uint8[getBlockSizeWithOverhead() * alloc->getNumBlockPerChunk()];
+	Mem = (uint8 *)aligned_malloc(getBlockSizeWithOverhead() * alloc->getNumBlockPerChunk(), NL_DEFAULT_MEMORY_ALIGNMENT); nlassert(Mem != NULL); // unlike the new[] it replaces, aligned_malloc reports failure by returning NULL
 	//
 	getNode(0).Chunk = this;
 	getNode(0).Next = &getNode(1);
@ -179,7 +186,7 @@ void *CFixedSizeAllocator::CNode::unlink()
 	*Prev = Next;
 	nlassert(Chunk->NumFreeObjs > 0);
 	Chunk->grab(); // tells the containing chunk that a node has been allocated
-	return (void *) &Next;
+	return (void *)((uintptr_t)(this) + aligned_offsetof(CNode, Next)); //(void *) &Next;
 }

 // *****************************************************************************************************************
--- a/code/nel/src/misc/matrix.cpp
+++ b/code/nel/src/misc/matrix.cpp
@ -140,6 +140,7 @@ inline void CMatrix::testExpandRot() const
 		self->Scale33= 1;
 	}
 }
+
 inline void CMatrix::testExpandProj() const
 {
 	if(hasProj())
--- a/code/nel/src/misc/object_arena_allocator.cpp
+++ b/code/nel/src/misc/object_arena_allocator.cpp
@ -68,21 +68,23 @@ void *CObjectArenaAllocator::alloc(uint size)
 	if (size >= _MaxAllocSize)
 	{
 		// use standard allocator
-		uint8 *block = new uint8[size + sizeof(uint)]; // an additionnal uint is needed to store size of block
+		nlctassert(NL_DEFAULT_MEMORY_ALIGNMENT >= sizeof(uint)); // the size header must fit in the alignment-sized prefix (exactly equal when the alignment is 4)
+		uint8 *block = (uint8 *)aligned_malloc(NL_DEFAULT_MEMORY_ALIGNMENT + size, NL_DEFAULT_MEMORY_ALIGNMENT); // a prefix of NL_DEFAULT_MEMORY_ALIGNMENT bytes stores the block size and keeps the returned pointer aligned
 		if (!block) return NULL;
 		#ifdef NL_DEBUG
 			_MemBlockToAllocID[block] = _AllocID;
 		#endif
 		*(uint *) block = size;
-		return block + sizeof(uint);
+		return block + NL_DEFAULT_MEMORY_ALIGNMENT;
 	}
 	uint entry = ((size + (_Granularity - 1)) / _Granularity) ;
 	nlassert(entry < _ObjectSizeToAllocator.size());
 	if (!_ObjectSizeToAllocator[entry])
 	{
-		_ObjectSizeToAllocator[entry] = new CFixedSizeAllocator(entry * _Granularity + sizeof(uint), _MaxAllocSize / size); // an additionnal uint is needed to store size of block
+		_ObjectSizeToAllocator[entry] = new CFixedSizeAllocator(entry * _Granularity + NL_DEFAULT_MEMORY_ALIGNMENT, _MaxAllocSize / size); // an additionnal uint is needed to store size of block
 	}
 	void *block = _ObjectSizeToAllocator[entry]->alloc();
+	nlassert(((uintptr_t)block % NL_DEFAULT_MEMORY_ALIGNMENT) == 0);
 	#ifdef NL_DEBUG
 		if (block)
 		{
@ -91,14 +93,14 @@ void *CObjectArenaAllocator::alloc(uint size)
 		++_AllocID;
 	#endif
 	*(uint *) block = size;
-	return (void *) ((uint8 *) block + sizeof(uint));
+	return (void *) ((uint8 *) block + NL_DEFAULT_MEMORY_ALIGNMENT);
 }

 // *****************************************************************************************************************
 void CObjectArenaAllocator::free(void *block)
 {
 	if (!block) return;
-	uint8 *realBlock = (uint8 *) block - sizeof(uint); // a uint is used at start of block to give its size
+	uint8 *realBlock = (uint8 *) block - NL_DEFAULT_MEMORY_ALIGNMENT; // sizeof(uint); // a uint is used at start of block to give its size
 	uint size = *(uint *) realBlock;
 	if (size >= _MaxAllocSize)
 	{
@ -107,7 +109,7 @@ void CObjectArenaAllocator::free(void *block)
 				nlassert(it != _MemBlockToAllocID.end());
 				_MemBlockToAllocID.erase(it);
 		#endif
-		delete realBlock;
+		aligned_free(realBlock);
 		return;
 	}
 	uint entry = ((size + (_Granularity - 1)) / _Granularity);