From: eihrul <eihrul@d7cf8633-e32d-0410-b094-e92efae38249>
Date: Sat, 5 Feb 2011 00:19:12 +0000 (+0000)
Subject: optimized MultiplyVaryingBGRA8 and VaryingBGRA8
X-Git-Tag: xonotic-v0.5.0~438^2~39
X-Git-Url: https://git.rm.cloudns.org/?a=commitdiff_plain;h=49c37132e5ba08c4cc19b11a8c8ae204eea262e5;p=xonotic%2Fdarkplaces.git

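Optimized MultiplyVaryingBGRA8 and VaryingBGRA8: instead of a float
multiply and float-to-int conversion for every pixel, the interpolated
attribute is now evaluated only at subspan endpoints (at most
DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart) and stepped linearly in 16-bit
fixed point in between.

This change also comments out the USE_THREADS define in USE_SDL builds.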


git-svn-id: svn://svn.icculus.org/twilight/trunk/darkplaces@10802 d7cf8633-e32d-0410-b094-e92efae38249
---

diff --git a/dpsoftrast.c b/dpsoftrast.c
index aff3befe..c6b8cf21 100644
--- a/dpsoftrast.c
+++ b/dpsoftrast.c
@@ -6,7 +6,7 @@
 #include "dpsoftrast.h"
 
 #ifdef USE_SDL
-#define USE_THREADS
+//#define USE_THREADS
 #endif
 
 #ifdef USE_THREADS
@@ -2859,29 +2859,42 @@ void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle *
 	int startx = span->startx;
 	int endx = span->endx;
 	__m128 data, slope;
+	__m128 mod, endmod;
+	__m128i submod, substep, endsubmod;
 	DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
 	data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
 	slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
-	data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
-	data = _mm_mul_ps(data, _mm_set1_ps(256.0f));
-	slope = _mm_mul_ps(slope, _mm_set1_ps(256.0f));
-	for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
-	{
-		__m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
-		__m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), mod2;
-		data = _mm_add_ps(data, slope);
-		mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
-		mod = _mm_unpacklo_epi64(_mm_packs_epi32(mod, mod), _mm_packs_epi32(mod2, mod2));
-		pix = _mm_mulhi_epu16(pix, mod);
-		_mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
-	}
-	for (;x < endx;x++, data = _mm_add_ps(data, slope))
+	endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
+	endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
+	for (x = startx; x < endx;)
 	{
-		__m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
-		__m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
-		mod = _mm_packs_epi32(mod, mod);
-		pix = _mm_mulhi_epu16(pix, mod);
-		*(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
+		int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
+		__m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
+		if(nextsub >= endx)
+		{
+			nextsub = endsub = endx-1;
+			if(x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
+		}
+		mod = endmod;
+		submod = endsubmod;
+		endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
+		substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
+		endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
+		submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
+		substep = _mm_packs_epi32(substep, substep);
+		for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
+		{
+			__m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
+			pix = _mm_mulhi_epu16(pix, submod);
+			_mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
+		}
+		if (x <= endsub)
+		{
+			__m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
+			pix = _mm_mulhi_epu16(pix, submod);
+			*(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
+			x++;
+		}
 	}
 #endif
 }
@@ -2893,25 +2906,40 @@ void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRIC
 	int startx = span->startx;
 	int endx = span->endx;
 	__m128 data, slope;
+	__m128 mod, endmod;
+	__m128i submod, substep, endsubmod;
 	DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
 	data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
 	slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
-	data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
-	data = _mm_mul_ps(data, _mm_set1_ps(255.0f));
-	slope = _mm_mul_ps(slope, _mm_set1_ps(255.0f));
-	for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
-	{
-		__m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), pix2;
-		data = _mm_add_ps(data, slope);
-		pix2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
-		pix = _mm_unpacklo_epi64(_mm_packs_epi32(pix, pix), _mm_packs_epi32(pix2, pix2));
-		_mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
-	}
-	for (;x < endx;x++, data = _mm_add_ps(data, slope))
+	endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
+	endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
+	for (x = startx; x < endx;)
 	{
-		__m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
-		pix = _mm_packs_epi32(pix, pix);
-		*(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
+		int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
+		__m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
+		if(nextsub >= endx)
+		{
+			nextsub = endsub = endx-1;
+			if(x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
+		}
+		mod = endmod;
+		submod = endsubmod;
+		endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
+		substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
+		endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
+		submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
+		substep = _mm_packs_epi32(substep, substep);
+		for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
+		{
+			__m128i pix = _mm_srai_epi16(submod, 4);
+			_mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
+		}
+		if (x <= endsub)
+		{
+			__m128i pix = _mm_srai_epi16(submod, 4);
+			*(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
+			x++;
+		}
 	}
 #endif
 }