# HG changeset patch # User Radek Brich # Date 1209329717 -7200 # Node ID 1081e3dd3f3e421a184a5752116e391c67e767c8 # Parent ce6abe0aeeae1bcb3903dbc6f78febaf4f4d38f7 Sphere, Box - RayPacket intersection replace 5x oversampling with 4x uniform oversampling diff -r ce6abe0aeeae -r 1081e3dd3f3e include/sampler.h --- a/include/sampler.h Sun Apr 27 19:56:23 2008 +0200 +++ b/include/sampler.h Sun Apr 27 22:55:17 2008 +0200 @@ -79,7 +79,7 @@ { int phase; int subsample; // 0,1 = no, 1+ = size of sampling grid - int oversample; // 0 = no, 1 = 5x, 2 = 9x, 3 = 16x + int oversample; // 0 = no, 1 = 4x, 2 = 9x, 3 = 16x int sx,sy,osa_samp; // current sample properties public: DefaultSampler(Float *abuffer, int &aw, int &ah): diff -r ce6abe0aeeae -r 1081e3dd3f3e include/shapes.h --- a/include/shapes.h Sun Apr 27 19:56:23 2008 +0200 +++ b/include/shapes.h Sun Apr 27 22:55:17 2008 +0200 @@ -100,6 +100,7 @@ sqr_radius(aradius*aradius), inv_radius(1.0f/aradius) { material = amaterial; } bool intersect(const Ray &ray, Float &dist) const; + __m128 intersect_packet(const RayPacket &rays, __m128 &dists); bool intersect_all(const Ray &ray, Float dist, vector &allts) const; bool intersect_bbox(const BBox &bbox) const; const Vector3 normal(const Vector3 &P) const { return (P - center) * inv_radius; }; @@ -125,6 +126,7 @@ material = amaterial; }; bool intersect(const Ray &ray, Float &dist) const; + __m128 intersect_packet(const RayPacket &rays, __m128 &dists); bool intersect_all(const Ray &ray, Float dist, vector &allts) const { return false; }; bool intersect_bbox(const BBox &bbox) const; const Vector3 normal(const Vector3 &P) const; diff -r ce6abe0aeeae -r 1081e3dd3f3e include/vector.h --- a/include/vector.h Sun Apr 27 19:56:23 2008 +0200 +++ b/include/vector.h Sun Apr 27 22:55:17 2008 +0200 @@ -211,6 +211,8 @@ VectorPacket() {}; VectorPacket(__m128 ax, __m128 ay, __m128 az): mx(ax), my(ay), mz(az) {}; + VectorPacket(const Vector3 &v): + mx(_mm_set_ps1(v.x)), my(_mm_set_ps1(v.y)), mz(_mm_set_ps1(v.z)) {}; Vector3 getVector(int i) const { @@ -232,6 +234,15 @@ mz = _mm_mul_ps(mz, m); }; + // dot product + friend __m128 dot(const VectorPacket &a, const VectorPacket &b) + { + return _mm_add_ps(_mm_add_ps( + _mm_mul_ps(a.mx, b.mx), + _mm_mul_ps(a.my, b.my)), + _mm_mul_ps(a.mz, b.mz)); + }; + friend VectorPacket operator+(const VectorPacket &a, const VectorPacket &b) { return VectorPacket( diff -r ce6abe0aeeae -r 1081e3dd3f3e src/kdtree.cc --- a/src/kdtree.cc Sun Apr 27 19:56:23 2008 +0200 +++ b/src/kdtree.cc Sun Apr 27 22:55:17 2008 +0200 @@ -365,15 +365,14 @@ stack[entry].t = a; /* distinguish between internal and external origin of a ray*/ - stack[entry].pb = rays.o + rays.dir * a; /* external */ - for (int i = 0; i < 4; i++) - if (((float*)&a)[i] < 0.0) - { - /* internal */ - stack[entry].pb.x[i] = rays.o.x[i]; - stack[entry].pb.y[i] = rays.o.y[i]; - stack[entry].pb.z[i] = rays.o.z[i]; - } + t = _mm_cmplt_ps(a, mZero); + stack[entry].pb = rays.o + rays.dir * a; + stack[entry].pb.mx = _mm_or_ps(_mm_andnot_ps(t, stack[entry].pb.mx), + _mm_and_ps(t, rays.o.mx)); + stack[entry].pb.my = _mm_or_ps(_mm_andnot_ps(t, stack[entry].pb.my), + _mm_and_ps(t, rays.o.my)); + stack[entry].pb.mz = _mm_or_ps(_mm_andnot_ps(t, stack[entry].pb.mz), + _mm_and_ps(t, rays.o.mz)); /* setup initial exit point in the stack */ stack[exit].t = b; @@ -396,11 +395,11 @@ axis = node->getAxis(); // mask out invalid rays with near > far - __m128 curmask = _mm_and_ps(mask, _mm_cmple_ps(stack[entry].t, stack[exit].t)); - __m128 entry_lt = _mm_cmplt_ps(stack[entry].pb.ma[axis], splitVal); - __m128 entry_gt = _mm_cmpgt_ps(stack[entry].pb.ma[axis], splitVal); - __m128 exit_lt = _mm_cmplt_ps(stack[exit].pb.ma[axis], splitVal); - __m128 exit_gt = _mm_cmpgt_ps(stack[exit].pb.ma[axis], splitVal); + const __m128 curmask = _mm_and_ps(mask, _mm_cmple_ps(stack[entry].t, stack[exit].t)); + const __m128 entry_lt = _mm_cmplt_ps(stack[entry].pb.ma[axis], splitVal); + const __m128 entry_gt = _mm_cmpgt_ps(stack[entry].pb.ma[axis], splitVal); + const __m128 exit_lt = _mm_cmplt_ps(stack[exit].pb.ma[axis], splitVal); + const __m128 exit_gt = _mm_cmpgt_ps(stack[exit].pb.ma[axis], splitVal); // if all of // stack[entry].pb[axis] <= splitVal, @@ -488,7 +487,8 @@ { results = (*shape)->intersect_packet(rays, dists); int valid = _mm_movemask_ps( - _mm_and_ps(mask, _mm_and_ps(results, _mm_cmpge_ps(dists, _mm_sub_ps(stack[entry].t, mEps))))); + _mm_and_ps(mask, _mm_and_ps(results, + _mm_cmpge_ps(dists, _mm_sub_ps(stack[entry].t, mEps))))); for (int i = 0; i < 4; i++) { if (*shape != origin_shapes[i] && ((valid>>i)&1)) diff -r ce6abe0aeeae -r 1081e3dd3f3e src/sampler.cc --- a/src/sampler.cc Sun Apr 27 19:56:23 2008 +0200 +++ b/src/sampler.cc Sun Apr 27 22:55:17 2008 +0200 @@ -178,9 +178,9 @@ else if (phase == 2) { /* grid oversampling */ - static const int gridsamples[] = {1,5,9,16}; - static const Float osa5x[] = {0.0, -0.4, +0.4, +0.4, -0.4}; - static const Float osa5y[] = {0.0, -0.4, -0.4, +0.4, +0.4}; + static const int gridsamples[] = {1,4,9,16}; + static const Float osa4x[] = {-0.25, +0.25, +0.25, -0.25}; + static const Float osa4y[] = {-0.25, -0.25, +0.25, +0.25}; static const Float osa9x[] = {-0.34, 0.00, +0.34, -0.34, 0.00, +0.34, -0.34, 0.00, +0.34}; static const Float osa9y[] = {-0.34, -0.34, -0.34, @@ -191,8 +191,8 @@ static const Float osa16y[] = {-0.375, -0.375, -0.375, -0.375, -0.125, -0.125, -0.125, -0.125, +0.125, +0.125, +0.125, +0.125, +0.375, +0.375, +0.375, +0.375}; - static const Float *osaSx[] = {NULL, osa5x, osa9x, osa16x}; - static const Float *osaSy[] = {NULL, osa5y, osa9y, osa16y}; + static const Float *osaSx[] = {NULL, osa4x, osa9x, osa16x}; + static const Float *osaSy[] = {NULL, osa4y, osa9y, osa16y}; const int samples = gridsamples[oversample]; const Float *osax = osaSx[oversample]; const Float *osay = osaSy[oversample]; diff -r ce6abe0aeeae -r 1081e3dd3f3e src/shapes.cc --- a/src/shapes.cc Sun Apr 27 19:56:23 2008 +0200 +++ b/src/shapes.cc Sun Apr 27 22:55:17 2008 +0200 @@ -54,6 +54,32 @@ return false; } +__m128 Sphere::intersect_packet(const RayPacket &rays, __m128 &dists) +{ + VectorPacket V = rays.o - VectorPacket(center); + register __m128 d = _mm_sub_ps(mZero, dot(V, rays.dir)); + register __m128 Det = _mm_sub_ps(_mm_mul_ps(d, d), + _mm_sub_ps(dot(V,V), _mm_set_ps1(sqr_radius))); + register __m128 t1, t2, mask; + + mask = _mm_cmpgt_ps(Det, mZero); + if (!_mm_movemask_ps(mask)) + return mask; + + Det = _mm_sqrt_ps(Det); + t1 = _mm_sub_ps(d, Det); + t2 = _mm_add_ps(d, Det); + + mask = _mm_and_ps(mask, _mm_cmpgt_ps(t2, mZero)); + + const __m128 cond1 = _mm_and_ps(_mm_cmpgt_ps(t1, mZero), _mm_cmplt_ps(t1, dists)); + const __m128 cond2 = _mm_and_ps(_mm_cmple_ps(t1, mZero), _mm_cmplt_ps(t2, dists)); + const __m128 newdists = _mm_or_ps(_mm_and_ps(cond1, t1), _mm_and_ps(cond2, t2)); + mask = _mm_and_ps(mask, _mm_or_ps(cond1, cond2)); + dists = _mm_or_ps(_mm_and_ps(mask, newdists), _mm_andnot_ps(mask, dists)); + return mask; +} + /* if there should be CSG sometimes, this may be needed... */ bool Sphere::intersect_all(const Ray &ray, Float dist, vector &allts) const { @@ -149,6 +175,47 @@ return false; } +__m128 Box::intersect_packet(const RayPacket &rays, __m128 &dists) +{ + register __m128 tnear = mZero; + register __m128 tfar = mInf; + register __m128 t1, t2; + register __m128 mask = mAllSet; + + for (int i = 0; i < 3; i++) + { + const __m128 mL = _mm_set_ps1(L[i]); + const __m128 mH = _mm_set_ps1(H[i]); + mask = _mm_and_ps(mask, + _mm_or_ps( + _mm_or_ps(_mm_cmplt_ps(rays.dir.ma[i], mMEps), _mm_cmpgt_ps(rays.dir.ma[i], mEps)), + _mm_and_ps(_mm_cmpge_ps(rays.o.ma[i], mL), _mm_cmple_ps(rays.o.ma[i], mH)) + )); + if (!_mm_movemask_ps(mask)) + return mask; + + /* compute the intersection distance of the planes */ + t1 = _mm_div_ps(_mm_sub_ps(mL, rays.o.ma[i]), rays.dir.ma[i]); + t2 = _mm_div_ps(_mm_sub_ps(mH, rays.o.ma[i]), rays.dir.ma[i]); + + __m128 t = _mm_min_ps(t1, t2); + t2 = _mm_max_ps(t1, t2); + t1 = t; + + tnear = _mm_max_ps(tnear, t1); /* want largest Tnear */ + tfar = _mm_min_ps(tfar, t2); /* want smallest Tfar */ + + mask = _mm_and_ps(mask, + _mm_and_ps(_mm_cmple_ps(tnear, tfar), _mm_cmpge_ps(tfar, mZero))); + if (!_mm_movemask_ps(mask)) + return mask; + } + + mask = _mm_and_ps(mask, _mm_cmplt_ps(tnear, dists)); + dists = _mm_or_ps(_mm_and_ps(mask, tnear), _mm_andnot_ps(mask, dists)); + return mask; +} + bool Box::intersect_bbox(const BBox &bbox) const { return ( @@ -389,9 +456,7 @@ if (!_mm_movemask_ps(mask)) return mask; - for (int i = 0; i < 4; i++) - if ((_mm_movemask_ps(mask)>>i)&1) - ((float*)&dists)[i] = ((float*)&t)[i]; + dists = _mm_or_ps(_mm_andnot_ps(mask, dists), _mm_and_ps(mask, t)); return mask; } #endif