From: divverent Date: Fri, 8 Oct 2010 17:54:16 +0000 (+0000) Subject: SSE patch by kyre, with runtime CPU detection and a cvar r_skeletal_use_sse if SSE... X-Git-Tag: xonotic-v0.1.0preview~56^2~100 X-Git-Url: https://git.rm.cloudns.org/?a=commitdiff_plain;h=8823e14b9482ce5779c09a48e9b81f397f94b55f;p=xonotic%2Fdarkplaces.git SSE patch by kyre, with runtime CPU detection and a cvar r_skeletal_use_sse if SSE is detected git-svn-id: svn://svn.icculus.org/twilight/trunk/darkplaces@10517 d7cf8633-e32d-0410-b094-e92efae38249 --- diff --git a/makefile.inc b/makefile.inc index 3f41d462..b98e2b25 100644 --- a/makefile.inc +++ b/makefile.inc @@ -138,6 +138,8 @@ OBJ_COMMON= \ mdfour.o \ menu.o \ meshqueue.o \ + mod_skeletal_animatevertices_sse.o \ + mod_skeletal_animatevertices_generic.o \ model_alias.o \ model_brush.o \ model_shared.o \ @@ -188,6 +190,8 @@ CFLAGS_RELEASE= CFLAGS_RELEASE_PROFILE=-fbranch-probabilities CFLAGS_SDL=$(SDLCONFIG_CFLAGS) +CFLAGS_SSE=-msse + OPTIM_DEBUG=$(CPUOPTIMIZATIONS) #OPTIM_RELEASE=-O2 -fno-strict-aliasing -ffast-math -funroll-loops $(CPUOPTIMIZATIONS) #OPTIM_RELEASE=-O2 -fno-strict-aliasing -fno-math-errno -fno-trapping-math -ffinite-math-only -fno-signaling-nans -fcx-limited-range -funroll-loops $(CPUOPTIMIZATIONS) @@ -513,6 +517,10 @@ cd_sdl.o: cd_sdl.c $(CHECKLEVEL2) $(DO_CC) $(CFLAGS_SDL) +mod_skeletal_animatevertices_sse.o: mod_skeletal_animatevertices_sse.c + $(CHECKLEVEL2) + $(DO_CC) $(CFLAGS_SSE) + darkplaces.o: %.o : %.rc $(CHECKLEVEL2) $(WINDRES) -o $@ $< diff --git a/mod_skeletal_animatevertices_generic.c b/mod_skeletal_animatevertices_generic.c new file mode 100644 index 00000000..24cc8a9e --- /dev/null +++ b/mod_skeletal_animatevertices_generic.c @@ -0,0 +1,213 @@ +#include "mod_skeletal_animatevertices_generic.h" + +typedef struct +{ + float f[12]; +} +float12_t; + +void Mod_Skeletal_AnimateVertices_Generic(const dp_model_t * RESTRICT model, const frameblend_t * RESTRICT frameblend, const skeleton_t *skeleton, float * RESTRICT 
vertex3f, float * RESTRICT normal3f, float * RESTRICT svector3f, float * RESTRICT tvector3f) +{ + // vertex weighted skeletal + int i, k; + int blends; + float12_t *bonepose; + float12_t *boneposerelative; + float m[12]; + const blendweights_t * RESTRICT weights; + + if (!model->surfmesh.num_vertices) + return; + + //unsigned long long ts = rdtsc(); + bonepose = (float12_t *) Mod_Skeletal_AnimateVertices_AllocBuffers(sizeof(float12_t) * (model->num_bones*2 + model->surfmesh.num_blends)); + boneposerelative = bonepose + model->num_bones; + + if (skeleton && !skeleton->relativetransforms) + skeleton = NULL; + + // interpolate matrices + if (skeleton) + { + for (i = 0;i < model->num_bones;i++) + { + Matrix4x4_ToArray12FloatD3D(&skeleton->relativetransforms[i], m); + if (model->data_bones[i].parent >= 0) + R_ConcatTransforms(bonepose[model->data_bones[i].parent].f, m, bonepose[i].f); + else + memcpy(bonepose[i].f, m, sizeof(m)); + + // create a relative deformation matrix to describe displacement + // from the base mesh, which is used by the actual weighting + R_ConcatTransforms(bonepose[i].f, model->data_baseboneposeinverse + i * 12, boneposerelative[i].f); + } + } + else + { + float originscale = model->num_posescale; + float x,y,z,w,lerp; + const short * RESTRICT pose6s; + + for (i = 0;i < model->num_bones;i++) + { + memset(m, 0, sizeof(m)); + for (blends = 0;blends < MAX_FRAMEBLENDS && frameblend[blends].lerp > 0;blends++) + { + pose6s = model->data_poses6s + 6 * (frameblend[blends].subframe * model->num_bones + i); + lerp = frameblend[blends].lerp; + x = pose6s[3] * (1.0f / 32767.0f); + y = pose6s[4] * (1.0f / 32767.0f); + z = pose6s[5] * (1.0f / 32767.0f); + w = 1.0f - (x*x+y*y+z*z); + w = w > 0.0f ? 
-sqrt(w) : 0.0f; + m[ 0] += (1-2*(y*y+z*z)) * lerp; + m[ 1] += ( 2*(x*y-z*w)) * lerp; + m[ 2] += ( 2*(x*z+y*w)) * lerp; + m[ 3] += (pose6s[0] * originscale) * lerp; + m[ 4] += ( 2*(x*y+z*w)) * lerp; + m[ 5] += (1-2*(x*x+z*z)) * lerp; + m[ 6] += ( 2*(y*z-x*w)) * lerp; + m[ 7] += (pose6s[1] * originscale) * lerp; + m[ 8] += ( 2*(x*z-y*w)) * lerp; + m[ 9] += ( 2*(y*z+x*w)) * lerp; + m[10] += (1-2*(x*x+y*y)) * lerp; + m[11] += (pose6s[2] * originscale) * lerp; + } + VectorNormalize(m ); + VectorNormalize(m + 4); + VectorNormalize(m + 8); + if (i == r_skeletal_debugbone.integer) + m[r_skeletal_debugbonecomponent.integer % 12] += r_skeletal_debugbonevalue.value; + m[3] *= r_skeletal_debugtranslatex.value; + m[7] *= r_skeletal_debugtranslatey.value; + m[11] *= r_skeletal_debugtranslatez.value; + if (model->data_bones[i].parent >= 0) + R_ConcatTransforms(bonepose[model->data_bones[i].parent].f, m, bonepose[i].f); + else + memcpy(bonepose[i].f, m, sizeof(m)); + // create a relative deformation matrix to describe displacement + // from the base mesh, which is used by the actual weighting + R_ConcatTransforms(bonepose[i].f, model->data_baseboneposeinverse + i * 12, boneposerelative[i].f); + } + } + + // generate matrices for all blend combinations + weights = model->surfmesh.data_blendweights; + for (i = 0;i < model->surfmesh.num_blends;i++, weights++) + { + float * RESTRICT b = boneposerelative[model->num_bones + i].f; + const float * RESTRICT m = boneposerelative[weights->index[0]].f; + float f = weights->influence[0] * (1.0f / 255.0f); + b[ 0] = f*m[ 0]; b[ 1] = f*m[ 1]; b[ 2] = f*m[ 2]; b[ 3] = f*m[ 3]; + b[ 4] = f*m[ 4]; b[ 5] = f*m[ 5]; b[ 6] = f*m[ 6]; b[ 7] = f*m[ 7]; + b[ 8] = f*m[ 8]; b[ 9] = f*m[ 9]; b[10] = f*m[10]; b[11] = f*m[11]; + for (k = 1;k < 4 && weights->influence[k];k++) + { + m = boneposerelative[weights->index[k]].f; + f = weights->influence[k] * (1.0f / 255.0f); + b[ 0] += f*m[ 0]; b[ 1] += f*m[ 1]; b[ 2] += f*m[ 2]; b[ 3] += f*m[ 3]; + b[ 4] += f*m[ 
4]; b[ 5] += f*m[ 5]; b[ 6] += f*m[ 6]; b[ 7] += f*m[ 7]; + b[ 8] += f*m[ 8]; b[ 9] += f*m[ 9]; b[10] += f*m[10]; b[11] += f*m[11]; + } + } + +#define LOAD_MATRIX_SCALAR() const float * RESTRICT m = boneposerelative[*b].f + +#define LOAD_MATRIX3() \ + LOAD_MATRIX_SCALAR() +#define LOAD_MATRIX4() \ + LOAD_MATRIX_SCALAR() + +#define TRANSFORM_POSITION_SCALAR(in, out) \ + (out)[0] = ((in)[0] * m[0] + (in)[1] * m[1] + (in)[2] * m[ 2] + m[3]); \ + (out)[1] = ((in)[0] * m[4] + (in)[1] * m[5] + (in)[2] * m[ 6] + m[7]); \ + (out)[2] = ((in)[0] * m[8] + (in)[1] * m[9] + (in)[2] * m[10] + m[11]); +#define TRANSFORM_VECTOR_SCALAR(in, out) \ + (out)[0] = ((in)[0] * m[0] + (in)[1] * m[1] + (in)[2] * m[ 2]); \ + (out)[1] = ((in)[0] * m[4] + (in)[1] * m[5] + (in)[2] * m[ 6]); \ + (out)[2] = ((in)[0] * m[8] + (in)[1] * m[9] + (in)[2] * m[10]); + +#define TRANSFORM_POSITION(in, out) \ + TRANSFORM_POSITION_SCALAR(in, out) +#define TRANSFORM_VECTOR(in, out) \ + TRANSFORM_VECTOR_SCALAR(in, out) + + // transform vertex attributes by blended matrices + if (vertex3f) + { + const float * RESTRICT v = model->surfmesh.data_vertex3f; + const unsigned short * RESTRICT b = model->surfmesh.blends; + // special case common combinations of attributes to avoid repeated loading of matrices + if (normal3f) + { + const float * RESTRICT n = model->surfmesh.data_normal3f; + if (svector3f && tvector3f) + { + const float * RESTRICT sv = model->surfmesh.data_svector3f; + const float * RESTRICT tv = model->surfmesh.data_tvector3f; + + // Note that for SSE each iteration stores one element past end, so we break one vertex short + // and handle that with scalars in that case + for (i = 0; i < model->surfmesh.num_vertices; i++, v += 3, n += 3, sv += 3, tv += 3, b++, + vertex3f += 3, normal3f += 3, svector3f += 3, tvector3f += 3) + { + LOAD_MATRIX4(); + TRANSFORM_POSITION(v, vertex3f); + TRANSFORM_VECTOR(n, normal3f); + TRANSFORM_VECTOR(sv, svector3f); + TRANSFORM_VECTOR(tv, tvector3f); + } + + return; + } + + 
for (i = 0;i < model->surfmesh.num_vertices; i++, v += 3, n += 3, b++, vertex3f += 3, normal3f += 3) + { + LOAD_MATRIX4(); + TRANSFORM_POSITION(v, vertex3f); + TRANSFORM_VECTOR(n, normal3f); + } + } + else + { + for (i = 0;i < model->surfmesh.num_vertices; i++, v += 3, b++, vertex3f += 3) + { + LOAD_MATRIX4(); + TRANSFORM_POSITION(v, vertex3f); + } + } + } + + else if (normal3f) + { + const float * RESTRICT n = model->surfmesh.data_normal3f; + const unsigned short * RESTRICT b = model->surfmesh.blends; + for (i = 0; i < model->surfmesh.num_vertices; i++, n += 3, b++, normal3f += 3) + { + LOAD_MATRIX3(); + TRANSFORM_VECTOR(n, normal3f); + } + } + + if (svector3f) + { + const float * RESTRICT sv = model->surfmesh.data_svector3f; + const unsigned short * RESTRICT b = model->surfmesh.blends; + for (i = 0; i < model->surfmesh.num_vertices; i++, sv += 3, b++, svector3f += 3) + { + LOAD_MATRIX3(); + TRANSFORM_VECTOR(sv, svector3f); + } + } + + if (tvector3f) + { + const float * RESTRICT tv = model->surfmesh.data_tvector3f; + const unsigned short * RESTRICT b = model->surfmesh.blends; + for (i = 0; i < model->surfmesh.num_vertices; i++, tv += 3, b++, tvector3f += 3) + { + LOAD_MATRIX3(); + TRANSFORM_VECTOR(tv, tvector3f); + } + } +} diff --git a/mod_skeletal_animatevertices_generic.h b/mod_skeletal_animatevertices_generic.h new file mode 100644 index 00000000..2ad97eb6 --- /dev/null +++ b/mod_skeletal_animatevertices_generic.h @@ -0,0 +1,8 @@ +#ifndef MOD_SKELETAL_ANIMATEVERTICES_GENERIC_H +#define MOD_H + +#include "quakedef.h" + +void Mod_Skeletal_AnimateVertices_Generic(const dp_model_t * RESTRICT model, const frameblend_t * RESTRICT frameblend, const skeleton_t *skeleton, float * RESTRICT vertex3f, float * RESTRICT normal3f, float * RESTRICT svector3f, float * RESTRICT tvector3f); + +#endif diff --git a/mod_skeletal_animatevertices_sse.c b/mod_skeletal_animatevertices_sse.c new file mode 100644 index 00000000..d6f71f1a --- /dev/null +++ 
b/mod_skeletal_animatevertices_sse.c @@ -0,0 +1,329 @@ +#include "mod_skeletal_animatevertices_sse.h" + +#ifdef SSE_POSSIBLE + +#ifdef MATRIX4x4_OPENGLORIENTATION +#error "SSE skeletal requires D3D matrix layout" +#endif + +#include <xmmintrin.h> + +void Mod_Skeletal_AnimateVertices_SSE(const dp_model_t * RESTRICT model, const frameblend_t * RESTRICT frameblend, const skeleton_t *skeleton, float * RESTRICT vertex3f, float * RESTRICT normal3f, float * RESTRICT svector3f, float * RESTRICT tvector3f) +{ + // vertex weighted skeletal + int i, k; + int blends; + matrix4x4_t *bonepose; + matrix4x4_t *boneposerelative; + float m[12]; + matrix4x4_t mm, mm2; + const blendweights_t * RESTRICT weights; + int num_vertices_minus_one; + + if (!model->surfmesh.num_vertices) + return; + + num_vertices_minus_one = model->surfmesh.num_vertices - 1; + + //unsigned long long ts = rdtsc(); + bonepose = (matrix4x4_t *) Mod_Skeletal_AnimateVertices_AllocBuffers(sizeof(matrix4x4_t) * (model->num_bones*2 + model->surfmesh.num_blends)); + boneposerelative = bonepose + model->num_bones; + + if (skeleton && !skeleton->relativetransforms) + skeleton = NULL; + + // interpolate matrices + if (skeleton) + { + for (i = 0;i < model->num_bones;i++) + { + // relativetransforms is in GL column-major order, which is what we need for SSE + // transposed style processing + if (model->data_bones[i].parent >= 0) + Matrix4x4_Concat(&bonepose[i], &bonepose[model->data_bones[i].parent], &skeleton->relativetransforms[i]); + else + memcpy(&bonepose[i], &skeleton->relativetransforms[i], sizeof(matrix4x4_t)); + + // create a relative deformation matrix to describe displacement + // from the base mesh, which is used by the actual weighting + Matrix4x4_FromArray12FloatD3D(&mm, model->data_baseboneposeinverse + i * 12); // baseboneposeinverse is 4x3 row-major + Matrix4x4_Concat(&boneposerelative[i], &bonepose[i], &mm); + } + } + else + { + float originscale = model->num_posescale; + float x,y,z,w,lerp; + const short * RESTRICT 
pose6s; + + for (i = 0;i < model->num_bones;i++) + { + memset(m, 0, sizeof(m)); + for (blends = 0;blends < MAX_FRAMEBLENDS && frameblend[blends].lerp > 0;blends++) + { + pose6s = model->data_poses6s + 6 * (frameblend[blends].subframe * model->num_bones + i); + lerp = frameblend[blends].lerp; + x = pose6s[3] * (1.0f / 32767.0f); + y = pose6s[4] * (1.0f / 32767.0f); + z = pose6s[5] * (1.0f / 32767.0f); + w = 1.0f - (x*x+y*y+z*z); + w = w > 0.0f ? -sqrt(w) : 0.0f; + m[ 0] += (1-2*(y*y+z*z)) * lerp; + m[ 1] += ( 2*(x*y-z*w)) * lerp; + m[ 2] += ( 2*(x*z+y*w)) * lerp; + m[ 3] += (pose6s[0] * originscale) * lerp; + m[ 4] += ( 2*(x*y+z*w)) * lerp; + m[ 5] += (1-2*(x*x+z*z)) * lerp; + m[ 6] += ( 2*(y*z-x*w)) * lerp; + m[ 7] += (pose6s[1] * originscale) * lerp; + m[ 8] += ( 2*(x*z-y*w)) * lerp; + m[ 9] += ( 2*(y*z+x*w)) * lerp; + m[10] += (1-2*(x*x+y*y)) * lerp; + m[11] += (pose6s[2] * originscale) * lerp; + } + VectorNormalize(m ); + VectorNormalize(m + 4); + VectorNormalize(m + 8); + if (i == r_skeletal_debugbone.integer) + m[r_skeletal_debugbonecomponent.integer % 12] += r_skeletal_debugbonevalue.value; + m[3] *= r_skeletal_debugtranslatex.value; + m[7] *= r_skeletal_debugtranslatey.value; + m[11] *= r_skeletal_debugtranslatez.value; + Matrix4x4_FromArray12FloatD3D(&mm, m); + if (model->data_bones[i].parent >= 0) + Matrix4x4_Concat(&bonepose[i], &bonepose[model->data_bones[i].parent], &mm); + else + memcpy(&bonepose[i], &mm, sizeof(mm)); + // create a relative deformation matrix to describe displacement + // from the base mesh, which is used by the actual weighting + Matrix4x4_FromArray12FloatD3D(&mm, model->data_baseboneposeinverse + i * 12); // baseboneposeinverse is 4x3 row-major + Matrix4x4_Concat(&mm2, &bonepose[i], &mm); + Matrix4x4_Transpose(&boneposerelative[i], &mm2); // TODO: Eliminate this transpose + } + } + + // generate matrices for all blend combinations + weights = model->surfmesh.data_blendweights; + for (i = 0;i < model->surfmesh.num_blends;i++, 
weights++) + { + float * RESTRICT b = &boneposerelative[model->num_bones + i].m[0][0]; + const float * RESTRICT m = &boneposerelative[weights->index[0]].m[0][0]; + float f = weights->influence[0] * (1.0f / 255.0f); + __m128 fv = _mm_set_ps1(f); + __m128 b0 = _mm_load_ps(m); + __m128 b1 = _mm_load_ps(m+4); + __m128 b2 = _mm_load_ps(m+8); + __m128 b3 = _mm_load_ps(m+12); + __m128 m0, m1, m2, m3; + b0 = _mm_mul_ps(b0, fv); + b1 = _mm_mul_ps(b1, fv); + b2 = _mm_mul_ps(b2, fv); + b3 = _mm_mul_ps(b3, fv); + for (k = 1;k < 4 && weights->influence[k];k++) + { + m = &boneposerelative[weights->index[k]].m[0][0]; + f = weights->influence[k] * (1.0f / 255.0f); + fv = _mm_set_ps1(f); + m0 = _mm_load_ps(m); + m1 = _mm_load_ps(m+4); + m2 = _mm_load_ps(m+8); + m3 = _mm_load_ps(m+12); + m0 = _mm_mul_ps(m0, fv); + m1 = _mm_mul_ps(m1, fv); + m2 = _mm_mul_ps(m2, fv); + m3 = _mm_mul_ps(m3, fv); + b0 = _mm_add_ps(m0, b0); + b1 = _mm_add_ps(m1, b1); + b2 = _mm_add_ps(m2, b2); + b3 = _mm_add_ps(m3, b3); + } + _mm_store_ps(b, b0); + _mm_store_ps(b+4, b1); + _mm_store_ps(b+8, b2); + _mm_store_ps(b+12, b3); + } + +#define LOAD_MATRIX_SCALAR() const float * RESTRICT m = &boneposerelative[*b].m[0][0] + +#define LOAD_MATRIX3() \ + const float * RESTRICT m = &boneposerelative[*b].m[0][0]; \ + /* bonepose array is 16 byte aligned */ \ + __m128 m1 = _mm_load_ps((m)); \ + __m128 m2 = _mm_load_ps((m)+4); \ + __m128 m3 = _mm_load_ps((m)+8); +#define LOAD_MATRIX4() \ + const float * RESTRICT m = &boneposerelative[*b].m[0][0]; \ + /* bonepose array is 16 byte aligned */ \ + __m128 m1 = _mm_load_ps((m)); \ + __m128 m2 = _mm_load_ps((m)+4); \ + __m128 m3 = _mm_load_ps((m)+8); \ + __m128 m4 = _mm_load_ps((m)+12) + + /* Note that matrix is 4x4 and transposed compared to non-USE_SSE codepath */ +#define TRANSFORM_POSITION_SCALAR(in, out) \ + (out)[0] = ((in)[0] * m[0] + (in)[1] * m[4] + (in)[2] * m[ 8] + m[12]); \ + (out)[1] = ((in)[0] * m[1] + (in)[1] * m[5] + (in)[2] * m[ 9] + m[13]); \ + (out)[2] = 
((in)[0] * m[2] + (in)[1] * m[6] + (in)[2] * m[10] + m[14]); +#define TRANSFORM_VECTOR_SCALAR(in, out) \ + (out)[0] = ((in)[0] * m[0] + (in)[1] * m[4] + (in)[2] * m[ 8]); \ + (out)[1] = ((in)[0] * m[1] + (in)[1] * m[5] + (in)[2] * m[ 9]); \ + (out)[2] = ((in)[0] * m[2] + (in)[1] * m[6] + (in)[2] * m[10]); + +#define TRANSFORM_POSITION(in, out) { \ + __m128 pin = _mm_loadu_ps(in); /* we ignore the value in the last element (x from the next vertex) */ \ + __m128 x = _mm_shuffle_ps(pin, pin, 0x0); \ + __m128 t1 = _mm_mul_ps(x, m1); \ + \ + /* y, + x */ \ + __m128 y = _mm_shuffle_ps(pin, pin, 0x55); \ + __m128 t2 = _mm_mul_ps(y, m2); \ + __m128 t3 = _mm_add_ps(t1, t2); \ + \ + /* z, + (y+x) */ \ + __m128 z = _mm_shuffle_ps(pin, pin, 0xaa); \ + __m128 t4 = _mm_mul_ps(z, m3); \ + __m128 t5 = _mm_add_ps(t3, t4); \ + \ + /* + m3 */ \ + __m128 pout = _mm_add_ps(t5, m4); \ + _mm_storeu_ps((out), pout); \ + } + +#define TRANSFORM_VECTOR(in, out) { \ + __m128 vin = _mm_loadu_ps(in); \ + \ + /* x */ \ + __m128 x = _mm_shuffle_ps(vin, vin, 0x0); \ + __m128 t1 = _mm_mul_ps(x, m1); \ + \ + /* y, + x */ \ + __m128 y = _mm_shuffle_ps(vin, vin, 0x55); \ + __m128 t2 = _mm_mul_ps(y, m2); \ + __m128 t3 = _mm_add_ps(t1, t2); \ + \ + /* nz, + (ny + nx) */ \ + __m128 z = _mm_shuffle_ps(vin, vin, 0xaa); \ + __m128 t4 = _mm_mul_ps(z, m3); \ + __m128 vout = _mm_add_ps(t3, t4); \ + _mm_storeu_ps((out), vout); \ + } + + // transform vertex attributes by blended matrices + if (vertex3f) + { + const float * RESTRICT v = model->surfmesh.data_vertex3f; + const unsigned short * RESTRICT b = model->surfmesh.blends; + // special case common combinations of attributes to avoid repeated loading of matrices + if (normal3f) + { + const float * RESTRICT n = model->surfmesh.data_normal3f; + if (svector3f && tvector3f) + { + const float * RESTRICT sv = model->surfmesh.data_svector3f; + const float * RESTRICT tv = model->surfmesh.data_tvector3f; + + // Note that for SSE each iteration stores one element past 
end, so we break one vertex short + // and handle that with scalars in that case + for (i = 0; i < num_vertices_minus_one; i++, v += 3, n += 3, sv += 3, tv += 3, b++, + vertex3f += 3, normal3f += 3, svector3f += 3, tvector3f += 3) + { + LOAD_MATRIX4(); + TRANSFORM_POSITION(v, vertex3f); + TRANSFORM_VECTOR(n, normal3f); + TRANSFORM_VECTOR(sv, svector3f); + TRANSFORM_VECTOR(tv, tvector3f); + } + + // Last vertex needs to be done with scalars to avoid reading/writing 1 word past end of arrays + { + LOAD_MATRIX_SCALAR(); + TRANSFORM_POSITION_SCALAR(v, vertex3f); + TRANSFORM_VECTOR_SCALAR(n, normal3f); + TRANSFORM_VECTOR_SCALAR(sv, svector3f); + TRANSFORM_VECTOR_SCALAR(tv, tvector3f); + } + //printf("elapsed ticks: %llu\n", rdtsc() - ts); // XXX + return; + } + + for (i = 0;i < num_vertices_minus_one; i++, v += 3, n += 3, b++, vertex3f += 3, normal3f += 3) + { + LOAD_MATRIX4(); + TRANSFORM_POSITION(v, vertex3f); + TRANSFORM_VECTOR(n, normal3f); + } + { + LOAD_MATRIX_SCALAR(); + TRANSFORM_POSITION_SCALAR(v, vertex3f); + TRANSFORM_VECTOR_SCALAR(n, normal3f); + } + } + else + { + for (i = 0;i < num_vertices_minus_one; i++, v += 3, b++, vertex3f += 3) + { + LOAD_MATRIX4(); + TRANSFORM_POSITION(v, vertex3f); + } + { + LOAD_MATRIX_SCALAR(); + TRANSFORM_POSITION_SCALAR(v, vertex3f); + } + } + } + + else if (normal3f) + { + const float * RESTRICT n = model->surfmesh.data_normal3f; + const unsigned short * RESTRICT b = model->surfmesh.blends; + for (i = 0; i < num_vertices_minus_one; i++, n += 3, b++, normal3f += 3) + { + LOAD_MATRIX3(); + TRANSFORM_VECTOR(n, normal3f); + } + { + LOAD_MATRIX_SCALAR(); + TRANSFORM_VECTOR_SCALAR(n, normal3f); + } + } + + if (svector3f) + { + const float * RESTRICT sv = model->surfmesh.data_svector3f; + const unsigned short * RESTRICT b = model->surfmesh.blends; + for (i = 0; i < num_vertices_minus_one; i++, sv += 3, b++, svector3f += 3) + { + LOAD_MATRIX3(); + TRANSFORM_VECTOR(sv, svector3f); + } + { + LOAD_MATRIX_SCALAR(); + 
TRANSFORM_VECTOR_SCALAR(sv, svector3f); + } + } + + if (tvector3f) + { + const float * RESTRICT tv = model->surfmesh.data_tvector3f; + const unsigned short * RESTRICT b = model->surfmesh.blends; + for (i = 0; i < num_vertices_minus_one; i++, tv += 3, b++, tvector3f += 3) + { + LOAD_MATRIX3(); + TRANSFORM_VECTOR(tv, tvector3f); + } + { + LOAD_MATRIX_SCALAR(); + TRANSFORM_VECTOR_SCALAR(tv, tvector3f); + } + } + +#undef LOAD_MATRIX3 +#undef LOAD_MATRIX4 +#undef TRANSFORM_POSITION +#undef TRANSFORM_VECTOR +#undef LOAD_MATRIX_SCALAR +#undef TRANSFORM_POSITION_SCALAR +#undef TRANSFORM_VECTOR_SCALAR +} + +#endif diff --git a/mod_skeletal_animatevertices_sse.h b/mod_skeletal_animatevertices_sse.h new file mode 100644 index 00000000..7de55ca6 --- /dev/null +++ b/mod_skeletal_animatevertices_sse.h @@ -0,0 +1,10 @@ +#ifndef MOD_SKELTAL_ANIMATEVERTICES_SSE_H +#define MOD_SKELTAL_ANIMATEVERTICES_SSE_H + +#include "quakedef.h" + +#ifdef SSE_POSSIBLE +void Mod_Skeletal_AnimateVertices_SSE(const dp_model_t * RESTRICT model, const frameblend_t * RESTRICT frameblend, const skeleton_t *skeleton, float * RESTRICT vertex3f, float * RESTRICT normal3f, float * RESTRICT svector3f, float * RESTRICT tvector3f); +#endif + +#endif diff --git a/model_alias.c b/model_alias.c index b731214a..2acd310e 100644 --- a/model_alias.c +++ b/model_alias.c @@ -21,7 +21,15 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
#include "quakedef.h" #include "image.h" #include "r_shadow.h" +#include "mod_skeletal_animatevertices_generic.h" +#ifdef SSE_POSSIBLE +#include "mod_skeletal_animatevertices_sse.h" +#endif +#ifdef SSE_POSSIBLE +static qboolean r_skeletal_use_sse_defined = false; +cvar_t r_skeletal_use_sse = {0, "r_skeletal_use_sse", "1", "use SSE for skeletal model animation"}; +#endif cvar_t r_skeletal_debugbone = {0, "r_skeletal_debugbone", "-1", "development cvar for testing skeletal model code"}; cvar_t r_skeletal_debugbonecomponent = {0, "r_skeletal_debugbonecomponent", "3", "development cvar for testing skeletal model code"}; cvar_t r_skeletal_debugbonevalue = {0, "r_skeletal_debugbonevalue", "100", "development cvar for testing skeletal model code"}; @@ -32,6 +40,88 @@ cvar_t mod_alias_supporttagscale = {0, "mod_alias_supporttagscale", "1", "suppor float mod_md3_sin[320]; +static size_t Mod_Skeltal_AnimateVertices_maxbonepose = 0; +static void *Mod_Skeltal_AnimateVertices_bonepose = NULL; +void Mod_Skeletal_FreeBuffers(void) +{ + if(Mod_Skeltal_AnimateVertices_bonepose) + Mem_Free(Mod_Skeltal_AnimateVertices_bonepose); + Mod_Skeltal_AnimateVertices_maxbonepose = 0; + Mod_Skeltal_AnimateVertices_bonepose = NULL; +} +void *Mod_Skeletal_AnimateVertices_AllocBuffers(size_t nbytes) +{ + if(Mod_Skeltal_AnimateVertices_maxbonepose < nbytes) + { + Mem_Free(Mod_Skeltal_AnimateVertices_bonepose); + Mod_Skeltal_AnimateVertices_bonepose = Z_Malloc(nbytes); + Mod_Skeltal_AnimateVertices_maxbonepose = nbytes; + } + return Mod_Skeltal_AnimateVertices_bonepose; +} + +void Mod_Skeletal_AnimateVertices(const dp_model_t * RESTRICT model, const frameblend_t * RESTRICT frameblend, const skeleton_t *skeleton, float * RESTRICT vertex3f, float * RESTRICT normal3f, float * RESTRICT svector3f, float * RESTRICT tvector3f) +{ +#ifdef SSE_POSSIBLE + if(r_skeletal_use_sse_defined) + if(r_skeletal_use_sse.integer) + { + Mod_Skeletal_AnimateVertices_SSE(model, frameblend, skeleton, vertex3f, normal3f, 
svector3f, tvector3f); + return; + } +#endif + Mod_Skeletal_AnimateVertices_Generic(model, frameblend, skeleton, vertex3f, normal3f, svector3f, tvector3f); +} + +#ifdef SSE_POSSIBLE +#ifndef SSE_PRESENT +// code from SDL, shortened as we can expect CPUID to work +static int CPUID_Features(void) +{ + int features = 0; +# if defined(__GNUC__) && defined(__i386__) + __asm__ ( +" movl %%ebx,%%edi\n" +" xorl %%eax,%%eax \n" +" incl %%eax \n" +" cpuid # Get family/model/stepping/features\n" +" movl %%edx,%0 \n" +" movl %%edi,%%ebx\n" + : "=m" (features) + : + : "%eax", "%ecx", "%edx", "%edi" + ); +# elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__) + __asm { + xor eax, eax + inc eax + cpuid ; Get family/model/stepping/features + mov features, edx + } +# else +# error SSE_POSSIBLE set but no CPUID implementation +# endif + return features; +} +#endif +static qboolean Have_SSE(void) +{ + // COMMANDLINEOPTION: SSE: -nosse disables SSE support and detection + if(COM_CheckParm("-nosse")) + return false; + // COMMANDLINEOPTION: SSE: -forcesse enables SSE support and disables detection +#ifdef SSE_PRESENT + return true; +#else + if(COM_CheckParm("-forcesse")) + return true; + if(CPUID_Features() & (1 << 25)) + return true; + return false; +#endif +} +#endif + void Mod_AliasInit (void) { int i; @@ -44,6 +134,20 @@ void Mod_AliasInit (void) Cvar_RegisterVariable(&mod_alias_supporttagscale); for (i = 0;i < 320;i++) mod_md3_sin[i] = sin(i * M_PI * 2.0f / 256.0); +#ifdef SSE_POSSIBLE + { + if(Have_SSE()) + { + Con_Printf("Skeletal animation uses SSE code path\n"); + r_skeletal_use_sse_defined = true; + Cvar_RegisterVariable(&r_skeletal_use_sse); + } + else + Con_Printf("Skeletal animation uses generic code path (SSE disabled or not detected)\n"); + } +#else + Con_Printf("Skeletal animation uses generic code path (SSE not compiled in)\n"); +#endif } int Mod_Skeletal_AddBlend(dp_model_t *model, const blendweights_t *newweights) @@ -106,216 +210,6 @@ int 
Mod_Skeletal_CompressBlend(dp_model_t *model, const int *newindex, const flo return Mod_Skeletal_AddBlend(model, &newweights); } -static int maxbonepose = 0; -static float (*bonepose)[12] = NULL; - -void Mod_Skeletal_FreeBuffers(void) -{ - if(bonepose) - Mem_Free(bonepose); - maxbonepose = 0; - bonepose = NULL; -} - -void Mod_Skeletal_AnimateVertices(const dp_model_t * RESTRICT model, const frameblend_t * RESTRICT frameblend, const skeleton_t *skeleton, float * RESTRICT vertex3f, float * RESTRICT normal3f, float * RESTRICT svector3f, float * RESTRICT tvector3f) -{ - // vertex weighted skeletal - int i, k; - int blends; - float m[12]; - float (*boneposerelative)[12]; - const blendweights_t * RESTRICT weights; - - if (maxbonepose < model->num_bones*2 + model->surfmesh.num_blends) - { - if (bonepose) - Z_Free(bonepose); - maxbonepose = model->num_bones*2 + model->surfmesh.num_blends; - bonepose = (float (*)[12])Z_Malloc(maxbonepose * sizeof(float[12])); - } - - boneposerelative = bonepose + model->num_bones; - - if (skeleton && !skeleton->relativetransforms) - skeleton = NULL; - - // interpolate matrices - if (skeleton) - { - for (i = 0;i < model->num_bones;i++) - { - Matrix4x4_ToArray12FloatD3D(&skeleton->relativetransforms[i], m); - if (model->data_bones[i].parent >= 0) - R_ConcatTransforms(bonepose[model->data_bones[i].parent], m, bonepose[i]); - else - memcpy(bonepose[i], m, sizeof(m)); - - // create a relative deformation matrix to describe displacement - // from the base mesh, which is used by the actual weighting - R_ConcatTransforms(bonepose[i], model->data_baseboneposeinverse + i * 12, boneposerelative[i]); - } - } - else - { - float originscale = model->num_posescale; - float x,y,z,w,lerp; - const short * RESTRICT pose6s; - for (i = 0;i < model->num_bones;i++) - { - memset(m, 0, sizeof(m)); - for (blends = 0;blends < MAX_FRAMEBLENDS && frameblend[blends].lerp > 0;blends++) - { - pose6s = model->data_poses6s + 6 * (frameblend[blends].subframe * 
model->num_bones + i); - lerp = frameblend[blends].lerp; - x = pose6s[3] * (1.0f / 32767.0f); - y = pose6s[4] * (1.0f / 32767.0f); - z = pose6s[5] * (1.0f / 32767.0f); - w = 1.0f - (x*x+y*y+z*z); - w = w > 0.0f ? -sqrt(w) : 0.0f; - m[ 0] += (1-2*(y*y+z*z)) * lerp; - m[ 1] += ( 2*(x*y-z*w)) * lerp; - m[ 2] += ( 2*(x*z+y*w)) * lerp; - m[ 3] += (pose6s[0] * originscale) * lerp; - m[ 4] += ( 2*(x*y+z*w)) * lerp; - m[ 5] += (1-2*(x*x+z*z)) * lerp; - m[ 6] += ( 2*(y*z-x*w)) * lerp; - m[ 7] += (pose6s[1] * originscale) * lerp; - m[ 8] += ( 2*(x*z-y*w)) * lerp; - m[ 9] += ( 2*(y*z+x*w)) * lerp; - m[10] += (1-2*(x*x+y*y)) * lerp; - m[11] += (pose6s[2] * originscale) * lerp; - } - VectorNormalize(m ); - VectorNormalize(m + 4); - VectorNormalize(m + 8); - if (i == r_skeletal_debugbone.integer) - m[r_skeletal_debugbonecomponent.integer % 12] += r_skeletal_debugbonevalue.value; - m[3] *= r_skeletal_debugtranslatex.value; - m[7] *= r_skeletal_debugtranslatey.value; - m[11] *= r_skeletal_debugtranslatez.value; - if (model->data_bones[i].parent >= 0) - R_ConcatTransforms(bonepose[model->data_bones[i].parent], m, bonepose[i]); - else - memcpy(bonepose[i], m, sizeof(m)); - // create a relative deformation matrix to describe displacement - // from the base mesh, which is used by the actual weighting - R_ConcatTransforms(bonepose[i], model->data_baseboneposeinverse + i * 12, boneposerelative[i]); - } - } - - // generate matrices for all blend combinations - weights = model->surfmesh.data_blendweights; - for (i = 0;i < model->surfmesh.num_blends;i++, weights++) - { - float * RESTRICT b = boneposerelative[model->num_bones + i]; - const float * RESTRICT m = boneposerelative[weights->index[0]]; - float f = weights->influence[0] * (1.0f / 255.0f); - b[ 0] = f*m[ 0]; b[ 1] = f*m[ 1]; b[ 2] = f*m[ 2]; b[ 3] = f*m[ 3]; - b[ 4] = f*m[ 4]; b[ 5] = f*m[ 5]; b[ 6] = f*m[ 6]; b[ 7] = f*m[ 7]; - b[ 8] = f*m[ 8]; b[ 9] = f*m[ 9]; b[10] = f*m[10]; b[11] = f*m[11]; - for (k = 1;k < 4 && 
weights->influence[k];k++) - { - m = boneposerelative[weights->index[k]]; - f = weights->influence[k] * (1.0f / 255.0f); - b[ 0] += f*m[ 0]; b[ 1] += f*m[ 1]; b[ 2] += f*m[ 2]; b[ 3] += f*m[ 3]; - b[ 4] += f*m[ 4]; b[ 5] += f*m[ 5]; b[ 6] += f*m[ 6]; b[ 7] += f*m[ 7]; - b[ 8] += f*m[ 8]; b[ 9] += f*m[ 9]; b[10] += f*m[10]; b[11] += f*m[11]; - } - } - - // transform vertex attributes by blended matrices - if (vertex3f) - { - const float * RESTRICT v = model->surfmesh.data_vertex3f; - const unsigned short * RESTRICT b = model->surfmesh.blends; - // special case common combinations of attributes to avoid repeated loading of matrices - if (normal3f) - { - const float * RESTRICT n = model->surfmesh.data_normal3f; - if (svector3f && tvector3f) - { - const float * RESTRICT sv = model->surfmesh.data_svector3f; - const float * RESTRICT tv = model->surfmesh.data_tvector3f; - for (i = 0;i < model->surfmesh.num_vertices;i++, v += 3, n += 3, sv += 3, tv += 3, b++, vertex3f += 3, normal3f += 3, svector3f += 3, tvector3f += 3) - { - const float * RESTRICT m = boneposerelative[*b]; - vertex3f[0] = (v[0] * m[0] + v[1] * m[1] + v[2] * m[ 2] + m[ 3]); - vertex3f[1] = (v[0] * m[4] + v[1] * m[5] + v[2] * m[ 6] + m[ 7]); - vertex3f[2] = (v[0] * m[8] + v[1] * m[9] + v[2] * m[10] + m[11]); - normal3f[0] = (n[0] * m[0] + n[1] * m[1] + n[2] * m[ 2]); - normal3f[1] = (n[0] * m[4] + n[1] * m[5] + n[2] * m[ 6]); - normal3f[2] = (n[0] * m[8] + n[1] * m[9] + n[2] * m[10]); - svector3f[0] = (sv[0] * m[0] + sv[1] * m[1] + sv[2] * m[ 2]); - svector3f[1] = (sv[0] * m[4] + sv[1] * m[5] + sv[2] * m[ 6]); - svector3f[2] = (sv[0] * m[8] + sv[1] * m[9] + sv[2] * m[10]); - tvector3f[0] = (tv[0] * m[0] + tv[1] * m[1] + tv[2] * m[ 2]); - tvector3f[1] = (tv[0] * m[4] + tv[1] * m[5] + tv[2] * m[ 6]); - tvector3f[2] = (tv[0] * m[8] + tv[1] * m[9] + tv[2] * m[10]); - } - return; - } - for (i = 0;i < model->surfmesh.num_vertices;i++, v += 3, n += 3, b++, vertex3f += 3, normal3f += 3) - { - const float * RESTRICT 
m = boneposerelative[*b]; - vertex3f[0] = (v[0] * m[0] + v[1] * m[1] + v[2] * m[ 2] + m[ 3]); - vertex3f[1] = (v[0] * m[4] + v[1] * m[5] + v[2] * m[ 6] + m[ 7]); - vertex3f[2] = (v[0] * m[8] + v[1] * m[9] + v[2] * m[10] + m[11]); - normal3f[0] = (n[0] * m[0] + n[1] * m[1] + n[2] * m[ 2]); - normal3f[1] = (n[0] * m[4] + n[1] * m[5] + n[2] * m[ 6]); - normal3f[2] = (n[0] * m[8] + n[1] * m[9] + n[2] * m[10]); - } - } - else - { - for (i = 0;i < model->surfmesh.num_vertices;i++, v += 3, b++, vertex3f += 3) - { - const float * RESTRICT m = boneposerelative[*b]; - vertex3f[0] = (v[0] * m[0] + v[1] * m[1] + v[2] * m[ 2] + m[ 3]); - vertex3f[1] = (v[0] * m[4] + v[1] * m[5] + v[2] * m[ 6] + m[ 7]); - vertex3f[2] = (v[0] * m[8] + v[1] * m[9] + v[2] * m[10] + m[11]); - } - } - } - else if (normal3f) - { - const float * RESTRICT n = model->surfmesh.data_normal3f; - const unsigned short * RESTRICT b = model->surfmesh.blends; - for (i = 0;i < model->surfmesh.num_vertices;i++, n += 3, b++, normal3f += 3) - { - const float * RESTRICT m = boneposerelative[*b]; - normal3f[0] = (n[0] * m[0] + n[1] * m[1] + n[2] * m[ 2]); - normal3f[1] = (n[0] * m[4] + n[1] * m[5] + n[2] * m[ 6]); - normal3f[2] = (n[0] * m[8] + n[1] * m[9] + n[2] * m[10]); - } - } - - if (svector3f) - { - const float * RESTRICT sv = model->surfmesh.data_svector3f; - const unsigned short * RESTRICT b = model->surfmesh.blends; - for (i = 0;i < model->surfmesh.num_vertices;i++, sv += 3, b++, svector3f += 3) - { - const float * RESTRICT m = boneposerelative[*b]; - svector3f[0] = (sv[0] * m[0] + sv[1] * m[1] + sv[2] * m[ 2]); - svector3f[1] = (sv[0] * m[4] + sv[1] * m[5] + sv[2] * m[ 6]); - svector3f[2] = (sv[0] * m[8] + sv[1] * m[9] + sv[2] * m[10]); - } - } - - if (tvector3f) - { - const float * RESTRICT tv = model->surfmesh.data_tvector3f; - const unsigned short * RESTRICT b = model->surfmesh.blends; - for (i = 0;i < model->surfmesh.num_vertices;i++, tv += 3, b++, tvector3f += 3) - { - const float * RESTRICT m = 
boneposerelative[*b]; - tvector3f[0] = (tv[0] * m[0] + tv[1] * m[1] + tv[2] * m[ 2]); - tvector3f[1] = (tv[0] * m[4] + tv[1] * m[5] + tv[2] * m[ 6]); - tvector3f[2] = (tv[0] * m[8] + tv[1] * m[9] + tv[2] * m[10]); - } - } -} - void Mod_MD3_AnimateVertices(const dp_model_t * RESTRICT model, const frameblend_t * RESTRICT frameblend, const skeleton_t *skeleton, float * RESTRICT vertex3f, float * RESTRICT normal3f, float * RESTRICT svector3f, float * RESTRICT tvector3f) { // vertex morph @@ -404,7 +298,6 @@ void Mod_MD3_AnimateVertices(const dp_model_t * RESTRICT model, const frameblend } } } - void Mod_MDL_AnimateVertices(const dp_model_t * RESTRICT model, const frameblend_t * RESTRICT frameblend, const skeleton_t *skeleton, float * RESTRICT vertex3f, float * RESTRICT normal3f, float * RESTRICT svector3f, float * RESTRICT tvector3f) { // vertex morph diff --git a/model_alias.h b/model_alias.h index a7564883..ddf8e5c0 100644 --- a/model_alias.h +++ b/model_alias.h @@ -235,5 +235,14 @@ aliasbone_t; // for decoding md3 model latlong vertex normals extern float mod_md3_sin[320]; +extern cvar_t r_skeletal_debugbone; +extern cvar_t r_skeletal_debugbonecomponent; +extern cvar_t r_skeletal_debugbonevalue; +extern cvar_t r_skeletal_debugtranslatex; +extern cvar_t r_skeletal_debugtranslatey; +extern cvar_t r_skeletal_debugtranslatez; + +void *Mod_Skeletal_AnimateVertices_AllocBuffers(size_t nbytes); + #endif diff --git a/quakedef.h b/quakedef.h index 9e62f0d2..771804b3 100644 --- a/quakedef.h +++ b/quakedef.h @@ -445,15 +445,28 @@ extern cvar_t developer_loading; #if defined(__GNUC__) # if defined(__i386__) # define DP_ARCH_STR "686" +# define SSE_POSSIBLE # elif defined(__x86_64__) # define DP_ARCH_STR "x86_64" +# define SSE_PRESENT # elif defined(__powerpc__) # define DP_ARCH_STR "ppc" # endif #elif defined(_WIN64) # define DP_ARCH_STR "x86_64" +# define SSE_PRESENT #elif defined(WIN32) # define DP_ARCH_STR "x86" +# define SSE_POSSIBLE +#endif + +#ifdef SSE_PRESENT +# define 
SSE_POSSIBLE +#endif + +#ifdef NO_SSE +# undef SSE_PRESENT +# undef SSE_POSSIBLE #endif /// incremented every frame, never reset