vectorize makeRayPacket() using SSE intrinsics pyrit
authorRadek Brich <radek.brich@devl.cz>
Thu, 24 Apr 2008 18:12:32 +0200
branchpyrit
changeset 83 e3a2a5b26abb
parent 82 930a2d3ecaed
child 84 6f7fe14782c2
vectorize makeRayPacket() using SSE intrinsics
include/common.h
include/scene.h
include/vector.h
src/raytracer.cc
--- a/include/common.h	Thu Apr 24 13:55:11 2008 +0200
+++ b/include/common.h	Thu Apr 24 18:12:32 2008 +0200
@@ -33,6 +33,9 @@
 #include <pthread.h>
 #include <string>
 
+// SSE intrinsics
+#include <xmmintrin.h>
+
 using namespace std;
 
 #ifdef PYRIT_DOUBLE
--- a/include/scene.h	Thu Apr 24 13:55:11 2008 +0200
+++ b/include/scene.h	Thu Apr 24 18:12:32 2008 +0200
@@ -80,8 +80,63 @@
 
 	void makeRayPacket(Sample *samples, Ray *rays)
 	{
+		__m128 m1x,m1y,m1z;
+		__m128 m2x,m2y,m2z;
+		__m128 m;
+		
+		// m1(xyz) = u * samples[i].x
+		m1x = _mm_set1_ps(u.x);
+		m1y = _mm_set1_ps(u.y);
+		m1z = _mm_set1_ps(u.z);
+		m = _mm_set_ps(samples[0].x, samples[1].x, samples[2].x, samples[3].x);
+		m1x = _mm_mul_ps(m1x, m);
+		m1y = _mm_mul_ps(m1y, m);
+		m1z = _mm_mul_ps(m1z, m);
+		
+		// m2(xyz) = v * samples[i].y
+		m2x = _mm_set1_ps(v.x);
+		m2y = _mm_set1_ps(v.y);
+		m2z = _mm_set1_ps(v.z);
+		m = _mm_set_ps(samples[0].y, samples[1].y, samples[2].y, samples[3].y);
+		m2x = _mm_mul_ps(m2x, m);
+		m2y = _mm_mul_ps(m2y, m);
+		m2z = _mm_mul_ps(m2z, m);
+		
+		// m1(xyz) = (m1 + m2) = (u*samples[i].x + v*samples[i].y)
+		m1x = _mm_add_ps(m1x, m2x);
+		m1y = _mm_add_ps(m1y, m2y);
+		m1z = _mm_add_ps(m1z, m2z);
+		
+		// m1(xyz) = m1*F = (u*samples[i].x + v*samples[i].y)*F
+		m = _mm_set_ps(F,F,F,F);
+		m1x = _mm_mul_ps(m1x, m);
+		m1y = _mm_mul_ps(m1y, m);
+		m1z = _mm_mul_ps(m1z, m);
+		
+		// m1(xyz) = p - m1 = p - (u*samples[i].x + v*samples[i].y)*F = dir
+		m2x = _mm_set1_ps(p.x);
+		m2y = _mm_set1_ps(p.y);
+		m2z = _mm_set1_ps(p.z);
+		m2x = _mm_sub_ps(m2x, m1x);
+		m2y = _mm_sub_ps(m2y, m1y);
+		m2z = _mm_sub_ps(m2z, m1z);
+		
+		// normalize dir
+		m1x = _mm_mul_ps(m2x, m2x); // x*x
+		m1y = _mm_mul_ps(m2y, m2y); // y*y
+		m1z = _mm_mul_ps(m2z, m2z); // z*z
+		m = _mm_add_ps(m1x, m1y);   // x*x + y*y
+		m = _mm_add_ps(m, m1z);     // m = x*x + y*y + z*z
+		m = _mm_sqrt_ps(m);         // m = sqrt(m)
+		m2x = _mm_div_ps(m2x, m);   // dir(xyz) /= m
+		m2y = _mm_div_ps(m2y, m);
+		m2z = _mm_div_ps(m2z, m);
+		
 		for (int i = 0; i < 4; i++)
-			rays[i] = makeRay(samples[i]);
+		{
+			Vector3 dir(((float*)&m2x)[3-i], ((float*)&m2y)[3-i], ((float*)&m2z)[3-i]);
+			rays[i] = Ray(eye, dir);
+		}
 	};
 };
 
--- a/include/vector.h	Thu Apr 24 13:55:11 2008 +0200
+++ b/include/vector.h	Thu Apr 24 18:12:32 2008 +0200
@@ -71,10 +71,10 @@
 	};
 
 	// get normalized copy
-	Vector3 unit() const
+	friend Vector3 normalize(Vector3 &v)
 	{
-		Vector3 u(*this);
-		return u.normalize();
+		const Float f = 1.0f / v.mag();
+		return v * f;
 	};
 
 	// square magnitude, magnitude
--- a/src/raytracer.cc	Thu Apr 24 13:55:11 2008 +0200
+++ b/src/raytracer.cc	Thu Apr 24 18:12:32 2008 +0200
@@ -404,7 +404,7 @@
 			pthread_mutex_unlock(&sample_queue_mutex);
 
 			sampdone += my_count;
-			dbgmsg(2, "\b\b\b\b\b\b\b\b%2d%% done", (sampdone - sample_queue_count)*100/sampnum);
+			dbgmsg(2, "\b\b\b\b\b\b\b\b%2d%% done", ((long)sampdone - sample_queue_count)*100/sampnum);
 
 			pthread_mutex_lock(&sampler_mutex);