# HG changeset patch # User Radek Brich # Date 1209053552 -7200 # Node ID e3a2a5b26abb40a637bc180ad4e57146ce6b9e0a # Parent 930a2d3ecaed4b4b4f57995f649a6a96de19d470 vectorize makeRayPacket() using SSE intrinsics diff -r 930a2d3ecaed -r e3a2a5b26abb include/common.h --- a/include/common.h Thu Apr 24 13:55:11 2008 +0200 +++ b/include/common.h Thu Apr 24 18:12:32 2008 +0200 @@ -33,6 +33,9 @@ #include #include +// SSE intrinsics +#include + using namespace std; #ifdef PYRIT_DOUBLE diff -r 930a2d3ecaed -r e3a2a5b26abb include/scene.h --- a/include/scene.h Thu Apr 24 13:55:11 2008 +0200 +++ b/include/scene.h Thu Apr 24 18:12:32 2008 +0200 @@ -80,8 +80,63 @@ void makeRayPacket(Sample *samples, Ray *rays) { + __m128 m1x,m1y,m1z; + __m128 m2x,m2y,m2z; + __m128 m; + + // m1(xyz) = u * samples[i].x + m1x = _mm_set1_ps(u.x); + m1y = _mm_set1_ps(u.y); + m1z = _mm_set1_ps(u.z); + m = _mm_set_ps(samples[0].x, samples[1].x, samples[2].x, samples[3].x); + m1x = _mm_mul_ps(m1x, m); + m1y = _mm_mul_ps(m1y, m); + m1z = _mm_mul_ps(m1z, m); + + // m2(xyz) = v * samples[i].y + m2x = _mm_set1_ps(v.x); + m2y = _mm_set1_ps(v.y); + m2z = _mm_set1_ps(v.z); + m = _mm_set_ps(samples[0].y, samples[1].y, samples[2].y, samples[3].y); + m2x = _mm_mul_ps(m2x, m); + m2y = _mm_mul_ps(m2y, m); + m2z = _mm_mul_ps(m2z, m); + + // m1(xyz) = (m1 + m2) = (u*samples[i].x + v*samples[i].y) + m1x = _mm_add_ps(m1x, m2x); + m1y = _mm_add_ps(m1y, m2y); + m1z = _mm_add_ps(m1z, m2z); + + // m1(xyz) = m1*F = (u*samples[i].x + v*samples[i].y)*F + m = _mm_set_ps(F,F,F,F); + m1x = _mm_mul_ps(m1x, m); + m1y = _mm_mul_ps(m1y, m); + m1z = _mm_mul_ps(m1z, m); + + // m1(xyz) = p - m1 = p - (u*samples[i].x + v*samples[i].y)*F = dir + m2x = _mm_set1_ps(p.x); + m2y = _mm_set1_ps(p.y); + m2z = _mm_set1_ps(p.z); + m2x = _mm_sub_ps(m2x, m1x); + m2y = _mm_sub_ps(m2y, m1y); + m2z = _mm_sub_ps(m2z, m1z); + + // normalize dir + m1x = _mm_mul_ps(m2x, m2x); // x*x + m1y = _mm_mul_ps(m2y, m2y); // y*y + m1z = _mm_mul_ps(m2z, m2z); // z*z + m = _mm_add_ps(m1x, m1y); // x*x + y*y + m = _mm_add_ps(m, m1z); // m = x*x + y*y + z*z + m = _mm_sqrt_ps(m); // m = sqrt(m) + m2x = _mm_div_ps(m2x, m); // dir(xyz) /= m + m2y = _mm_div_ps(m2y, m); + m2z = _mm_div_ps(m2z, m); + for (int i = 0; i < 4; i++) - rays[i] = makeRay(samples[i]); + { + Vector3 dir(((float*)&m2x)[3-i], ((float*)&m2y)[3-i], ((float*)&m2z)[3-i]); + rays[i] = Ray(eye, dir); + } }; }; diff -r 930a2d3ecaed -r e3a2a5b26abb include/vector.h --- a/include/vector.h Thu Apr 24 13:55:11 2008 +0200 +++ b/include/vector.h Thu Apr 24 18:12:32 2008 +0200 @@ -71,10 +71,10 @@ }; // get normalized copy - Vector3 unit() const + friend Vector3 normalize(Vector3 &v) { - Vector3 u(*this); - return u.normalize(); + const Float f = 1.0f / v.mag(); + return v * f; }; // square magnitude, magnitude diff -r 930a2d3ecaed -r e3a2a5b26abb src/raytracer.cc --- a/src/raytracer.cc Thu Apr 24 13:55:11 2008 +0200 +++ b/src/raytracer.cc Thu Apr 24 18:12:32 2008 +0200 @@ -404,7 +404,7 @@ pthread_mutex_unlock(&sample_queue_mutex); sampdone += my_count; - dbgmsg(2, "\b\b\b\b\b\b\b\b%2d%% done", (sampdone - sample_queue_count)*100/sampnum); + dbgmsg(2, "\b\b\b\b\b\b\b\b%2d%% done", ((long)sampdone - sample_queue_count)*100/sampnum); pthread_mutex_lock(&sampler_mutex);