From 9c9181154357884f68f23e5b997288b600b25608 Mon Sep 17 00:00:00 2001
From: eihrul <eihrul@d7cf8633-e32d-0410-b094-e92efae38249>
Date: Wed, 26 Jan 2011 15:35:52 +0000
Subject: [PATCH] SSE2 opts for nearest filtering

git-svn-id: svn://svn.icculus.org/twilight/trunk/darkplaces@10756 d7cf8633-e32d-0410-b094-e92efae38249
---
 dpsoftrast.c | 106 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 71 insertions(+), 35 deletions(-)

diff --git a/dpsoftrast.c b/dpsoftrast.c
index 282e9a8a..46987887 100644
--- a/dpsoftrast.c
+++ b/dpsoftrast.c
@@ -1835,13 +1835,11 @@ void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(const DPSOFTRAST_State_Draw_Span
 				for (; x + 1 <= endsub; x += 2, subtcm = _mm_add_epi32(subtcm, substepm))
 				{
 					__m128i tcim = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtcm, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
-					ALIGN(int pixeloffset[4]);
 					tcim = _mm_madd_epi16(_mm_add_epi16(tcim, _mm_setr_epi32(0, 0x10000, 0, 0x10000)), scalem);
-					_mm_store_si128((__m128i *)pixeloffset, tcim);
-					pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[pixeloffset[0]]), _mm_setzero_si128());
-					pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[pixeloffset[1]]), _mm_setzero_si128());
-					pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[pixeloffset[2]]), _mm_setzero_si128());
-					pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[pixeloffset[3]]), _mm_setzero_si128());
+					pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(tcim)]), _mm_setzero_si128());
+					pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tcim, _MM_SHUFFLE(1, 1, 1, 1)))]), _mm_setzero_si128());
+					pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tcim, _MM_SHUFFLE(2, 2, 2, 2)))]), _mm_setzero_si128());
+					pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tcim, _MM_SHUFFLE(3, 3, 3, 3)))]), _mm_setzero_si128());
 					fracm = _mm_srli_epi16(subtcm, 1);
 					pix1 = _mm_add_epi16(pix1,
 										 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
@@ -1859,11 +1857,9 @@ void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(const DPSOFTRAST_State_Draw_Span
 				if (x <= endsub)
 				{
 					__m128i tcim = _mm_shufflelo_epi16(subtcm, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
-					ALIGN(int pixeloffset[4]);
 					tcim = _mm_madd_epi16(_mm_add_epi16(tcim, _mm_setr_epi32(0, 0x10000, 0, 0)), scalem);
-					_mm_store_si128((__m128i *)pixeloffset, tcim);
-					pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[pixeloffset[0]]), _mm_setzero_si128());
-					pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[pixeloffset[1]]), _mm_setzero_si128());
+					pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(tcim)]), _mm_setzero_si128());
+					pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tcim, _MM_SHUFFLE(1, 1, 1, 1)))]), _mm_setzero_si128());
 					fracm = _mm_srli_epi16(subtcm, 1);
 					pix1 = _mm_add_epi16(pix1,
 										 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
@@ -1883,12 +1879,14 @@ void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(const DPSOFTRAST_State_Draw_Span
 				for (; x <= endsub; x++, subtcm = _mm_add_epi32(subtcm, substepm))
 				{
 					__m128i tcim = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtcm, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
-					ALIGN(int pixeloffset[4]);
 					tcim = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tcim, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), minm), maxm);
 					tcim = _mm_madd_epi16(tcim, scalem);
-					_mm_store_si128((__m128i *)pixeloffset, tcim);
-					pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[pixeloffset[0]]), _mm_cvtsi32_si128(*(const int *)&pixelbase[pixeloffset[1]])), _mm_setzero_si128());
-					pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[pixeloffset[2]]), _mm_cvtsi32_si128(*(const int *)&pixelbase[pixeloffset[3]])), _mm_setzero_si128());
+					pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tcim)]), 
+																_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tcim, _MM_SHUFFLE(1, 1, 1, 1)))])), 
+											_mm_setzero_si128());
+					pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tcim, _MM_SHUFFLE(2, 2, 2, 2)))]), 
+																_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tcim, _MM_SHUFFLE(3, 3, 3, 3)))])), 
+											_mm_setzero_si128());
 					fracm = _mm_srli_epi16(subtcm, 1);
 					pix1 = _mm_add_epi16(pix1,
 										 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
@@ -1908,12 +1906,14 @@ void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(const DPSOFTRAST_State_Draw_Span
 				{
 					__m128i tcim = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtcm, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)),
 							pix1, pix2, fracm;
-					ALIGN(int pixeloffset[4]);
 					tcim = _mm_and_si128(_mm_add_epi16(tcim, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), wrapm);
 					tcim = _mm_madd_epi16(tcim, scalem);
-					_mm_store_si128((__m128i *)pixeloffset, tcim);
-					pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[pixeloffset[0]]), _mm_cvtsi32_si128(*(const int *)&pixelbase[pixeloffset[1]])), _mm_setzero_si128());
-					pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[pixeloffset[2]]), _mm_cvtsi32_si128(*(const int *)&pixelbase[pixeloffset[3]])), _mm_setzero_si128());
+					pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tcim)]),											
+																_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tcim, _MM_SHUFFLE(1, 1, 1, 1)))])),
+											_mm_setzero_si128());
+					pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tcim, _MM_SHUFFLE(2, 2, 2, 2)))]),
+																_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tcim, _MM_SHUFFLE(3, 3, 3, 3)))])),
+											_mm_setzero_si128());
 					fracm = _mm_srli_epi16(subtcm, 1);
 					pix1 = _mm_add_epi16(pix1,
 										 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
@@ -1932,35 +1932,71 @@ void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(const DPSOFTRAST_State_Draw_Span
 			tci[1] = (subtc[1]>>16) - tcimin[1]; 
 			tci1[0] = ((subtc[0] + (endsub - x)*substep[0])>>16);
 			tci1[1] = ((subtc[1] + (endsub - x)*substep[1])>>16); 
-			if (tci[0] <= tcimax[0]-1 && tci[1] <= tcimax[1]-1 && tci1[0] <= tcimax[0]-1 && tci1[1] <= tcimax[1]-1)
+			if (tci[0] <= tcimax[0] && tci[1] <= tcimax[1] && tci1[0] <= tcimax[0] && tci1[1] <= tcimax[1])
 			{
-				for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
+				__m128i subtcm = _mm_setr_epi32(subtc[0], subtc[1], subtc[0] + substep[0], subtc[1] + substep[1]);
+				__m128i substepm = _mm_slli_epi32(_mm_setr_epi32(substep[0], substep[1], substep[0], substep[1]), 1);
+				__m128i scalem = _mm_set1_epi32((tciwidth<<18)+4);
+				for (; x + 1 <= endsub; x += 2, subtcm = _mm_add_epi32(subtcm, substepm))
 				{
-					tci[0] = subtc[0]>>16;
-					tci[1] = subtc[1]>>16;
-					outi[x] = pixelbasei[(tci[1]*tciwidth+tci[0])];
+					__m128i tcim = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtcm, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
+					tcim = _mm_madd_epi16(tcim, scalem);
+					outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tcim)];
+					outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tcim, _MM_SHUFFLE(1, 1, 1, 1)))];
+				}
+				if (x <= endsub)
+				{
+					__m128i tcim = _mm_shufflelo_epi16(subtcm, _MM_SHUFFLE(3, 1, 3, 1));
+					tcim = _mm_madd_epi16(tcim, scalem);
+					outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tcim)];
+					x++;
 				}
 			}
 			else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
 			{
-				for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
+				__m128i subtcm = _mm_setr_epi32(subtc[0], subtc[1], subtc[0] + substep[0], subtc[1] + substep[1]);
+				__m128i substepm = _mm_slli_epi32(_mm_setr_epi32(substep[0], substep[1], substep[0], substep[1]), 1);
+				__m128i minm = _mm_slli_epi32(_mm_setr_epi32(tcimin[0], tcimin[1], tcimin[0], tcimin[1]), 16);
+				__m128i maxm = _mm_slli_epi32(_mm_setr_epi32(tcimax[0], tcimax[1], tcimax[0], tcimax[1]), 16);
+				__m128i scalem = _mm_set1_epi32((tciwidth<<18)+4);
+				for (; x + 1 <= endsub; x += 2, subtcm = _mm_add_epi32(subtcm, substepm))
 				{
-					tci[0] = subtc[0]>>16;
-					tci[1] = subtc[1]>>16;
-					tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
-					tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
-					outi[x] = pixelbasei[(tci[1]*tciwidth+tci[0])];
+					__m128i tcim = _mm_min_epi16(_mm_max_epi16(subtcm, minm), maxm); 
+					tcim = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tcim, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
+					tcim = _mm_madd_epi16(tcim, scalem);
+					outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tcim)];
+					outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tcim, _MM_SHUFFLE(1, 1, 1, 1)))];
+				}
+				if (x <= endsub)
+				{
+					__m128i tcim = _mm_min_epi16(_mm_max_epi16(subtcm, minm), maxm);
+					tcim = _mm_shufflelo_epi16(tcim, _MM_SHUFFLE(3, 1, 3, 1));
+					tcim = _mm_madd_epi16(tcim, scalem);
+					outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tcim)];
+					x++;
 				}
 			}
 			else
 			{
-				for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
+				__m128i subtcm = _mm_setr_epi32(subtc[0], subtc[1], subtc[0] + substep[0], subtc[1] + substep[1]);
+				__m128i substepm = _mm_slli_epi32(_mm_setr_epi32(substep[0], substep[1], substep[0], substep[1]), 1);
+				__m128i wrapm = _mm_slli_epi32(_mm_setr_epi32(tciwrapmask[0], tciwrapmask[1], tciwrapmask[0], tciwrapmask[1]), 16);
+				__m128i scalem = _mm_set1_epi32((tciwidth<<18)+4);
+				for (; x + 1 <= endsub; x += 2, subtcm = _mm_add_epi32(subtcm, substepm))
 				{
-					tci[0] = subtc[0]>>16;
-					tci[1] = subtc[1]>>16;
-					tci[0] &= tciwrapmask[0];
-					tci[1] &= tciwrapmask[1];
-					outi[x] = pixelbasei[(tci[1]*tciwidth+tci[0])];
+					__m128i tcim = _mm_and_si128(subtcm, wrapm); 
+					tcim = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tcim, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
+					tcim = _mm_madd_epi16(tcim, scalem);
+					outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tcim)];
+					outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tcim, _MM_SHUFFLE(1, 1, 1, 1)))];
+				}
+				if (x <= endsub)
+				{
+					__m128i tcim = _mm_and_si128(subtcm, wrapm); 
+					tcim = _mm_shufflelo_epi16(tcim, _MM_SHUFFLE(3, 1, 3, 1));
+					tcim = _mm_madd_epi16(tcim, scalem);
+					outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tcim)];
+					x++;
 				}
 			}
 		}
-- 
2.39.5