--- a/include/common.h Sun Apr 27 14:19:37 2008 +0200
+++ b/include/common.h Sun Apr 27 19:56:23 2008 +0200
@@ -48,6 +48,13 @@
# define Inf FLT_MAX
#endif
+const __m128 mZero = _mm_set_ps1(0.);
+const __m128 mOne = _mm_set_ps1(1.);
+const __m128 mEps = _mm_set_ps1(Eps);
+const __m128 mMEps = _mm_set_ps1(-Eps);
+const __m128 mInf = _mm_set_ps1(Inf);
+const __m128 mAllSet = _mm_cmplt_ps(mZero, mOne);
+
/* verbosity level:
0: only errors (E)
1: major status messages (*)
--- a/include/scene.h Sun Apr 27 14:19:37 2008 +0200
+++ b/include/scene.h Sun Apr 27 19:56:23 2008 +0200
@@ -178,13 +178,7 @@
Float h() { return H.y-L.y; };
Float d() { return H.z-L.z; };
bool intersect(const Ray &ray, Float &a, Float &b);
- bool intersect_packet(const RayPacket &rays, __m128 &a, __m128 &b)
- {
- return intersect(rays[0], ((float*)&a)[0], ((float*)&b)[0])
- || intersect(rays[1], ((float*)&a)[1], ((float*)&b)[1])
- || intersect(rays[2], ((float*)&a)[2], ((float*)&b)[2])
- || intersect(rays[3], ((float*)&a)[3], ((float*)&b)[3]);
- };
+ __m128 intersect_packet(const RayPacket &rays, __m128 &a, __m128 &b);
};
#endif
--- a/include/vector.h Sun Apr 27 14:19:37 2008 +0200
+++ b/include/vector.h Sun Apr 27 19:56:23 2008 +0200
@@ -191,11 +191,6 @@
typedef Vector3 Colour;
-
-static const __m128 zeros = _mm_set_ps1(0.);
-static const __m128 ones = _mm_set_ps1(1.);
-static const __m128 mEps = _mm_set_ps1(Eps);
-
class VectorPacket
{
public:
@@ -231,7 +226,7 @@
m = _mm_add_ps(x, y);
m = _mm_add_ps(m, z); // x*x + y*y + z*z
m = _mm_sqrt_ps(m);
- m = _mm_div_ps(ones, m); // m = 1/sqrt(m)
+ m = _mm_div_ps(mOne, m); // m = 1/sqrt(m)
mx = _mm_mul_ps(mx, m);
my = _mm_mul_ps(my, m);
mz = _mm_mul_ps(mz, m);
@@ -272,7 +267,7 @@
// write to character stream
friend ostream & operator<<(ostream &st, const VectorPacket &v)
{
- return st << "[" << v.getVector(0) << "," << v.getVector(1)
+ return st << "[" << v.getVector(0) << "," << v.getVector(1)
<< "," << v.getVector(2) << "," << v.getVector(3) << ")";
};
--- a/src/kdtree.cc Sun Apr 27 14:19:37 2008 +0200
+++ b/src/kdtree.cc Sun Apr 27 19:56:23 2008 +0200
@@ -342,27 +342,16 @@
{
__m128 a, b; /* entry/exit signed distance */
__m128 t; /* signed distance to the splitting plane */
- __m128 mask = zeros;
+ __m128 mask = mZero;
/* if we have no tree, fall back to naive test */
if (!built)
Container::packet_intersection(origin_shapes, rays, nearest_distances, nearest_shapes);
- nearest_shapes[0] = NULL;
- nearest_shapes[1] = NULL;
- nearest_shapes[2] = NULL;
- nearest_shapes[3] = NULL;
+ // nearest_shapes[0..4] = NULL
+ memset(nearest_shapes, 0, 4*sizeof(Shape*));
- //bbox.intersect_packet(rays, a, b)
- if (bbox.intersect(rays[0], ((float*)&a)[0], ((float*)&b)[0]))
- ((int*)&mask)[0] = -1;
- if (bbox.intersect(rays[1], ((float*)&a)[1], ((float*)&b)[1]))
- ((int*)&mask)[1] = -1;
- if (bbox.intersect(rays[2], ((float*)&a)[2], ((float*)&b)[2]))
- ((int*)&mask)[2] = -1;
- if (bbox.intersect(rays[3], ((float*)&a)[3], ((float*)&b)[3]))
- ((int*)&mask)[3] = -1;
-
+ mask = bbox.intersect_packet(rays, a, b);
if (!_mm_movemask_ps(mask))
return;
@@ -396,7 +385,7 @@
__m128 splitVal;
int axis;
static const int mod3[] = {0,1,2,0,1};
- const VectorPacket invdirs = ones / rays.dir;
+ const VectorPacket invdirs = mOne / rays.dir;
while (node)
{
/* loop until a leaf is found */
@@ -494,6 +483,7 @@
__m128 dists = stack[exit].t;
ShapeList::iterator shape;
__m128 results;
+ __m128 newmask = mask;
for (shape = node->getShapes()->begin(); shape != node->getShapes()->end(); shape++)
{
results = (*shape)->intersect_packet(rays, dists);
@@ -505,14 +495,12 @@
{
nearest_shapes[i] = *shape;
nearest_distances[i] = ((float*)&dists)[i];
+ ((int*)&newmask)[i] = 0;
}
}
}
- for (int i = 0; i < 4; i++)
- if (nearest_shapes[i])
- ((int*)&mask)[i] = 0;
-
+ mask = newmask;
if (!_mm_movemask_ps(mask))
return;
--- a/src/raytracer.cc Sun Apr 27 14:19:37 2008 +0200
+++ b/src/raytracer.cc Sun Apr 27 19:56:23 2008 +0200
@@ -248,7 +248,6 @@
Shape *nearest_shapes[4];
static const Shape *origin_shapes[4] = {NULL, NULL, NULL, NULL};
top->packet_intersection(origin_shapes, rays, nearest_distances, nearest_shapes);
- Ray ray;
for (int i = 0; i < 4; i++)
if (nearest_shapes[i] == NULL)
--- a/src/scene.cc Sun Apr 27 14:19:37 2008 +0200
+++ b/src/scene.cc Sun Apr 27 19:56:23 2008 +0200
@@ -107,3 +107,45 @@
b = tfar;
return true;
}
+
+// rewrite of BBox::intersect for ray packets
+__m128 BBox::intersect_packet(const RayPacket &rays, __m128 &a, __m128 &b)
+{
+ register __m128 tnear = mZero;
+ register __m128 tfar = mInf;
+ register __m128 t1, t2;
+ register __m128 mask = mAllSet;
+
+ for (int i = 0; i < 3; i++)
+ {
+ const __m128 mL = _mm_set_ps1(L[i]);
+ const __m128 mH = _mm_set_ps1(H[i]);
+ mask = _mm_and_ps(mask,
+ _mm_or_ps(
+ _mm_or_ps(_mm_cmplt_ps(rays.dir.ma[i], mMEps), _mm_cmpgt_ps(rays.dir.ma[i], mEps)),
+ _mm_and_ps(_mm_cmpge_ps(rays.o.ma[i], mL), _mm_cmple_ps(rays.o.ma[i], mH))
+ ));
+ if (!_mm_movemask_ps(mask))
+ return mask;
+
+ /* compute the intersection distance of the planes */
+ t1 = _mm_div_ps(_mm_sub_ps(mL, rays.o.ma[i]), rays.dir.ma[i]);
+ t2 = _mm_div_ps(_mm_sub_ps(mH, rays.o.ma[i]), rays.dir.ma[i]);
+
+ __m128 t = _mm_min_ps(t1, t2);
+ t2 = _mm_max_ps(t1, t2);
+ t1 = t;
+
+ tnear = _mm_max_ps(tnear, t1); /* want largest Tnear */
+ tfar = _mm_min_ps(tfar, t2); /* want smallest Tfar */
+
+ mask = _mm_and_ps(mask,
+ _mm_and_ps(_mm_cmple_ps(tnear, tfar), _mm_cmpge_ps(tfar, mZero)));
+ if (!_mm_movemask_ps(mask))
+ return mask;
+ }
+
+ a = tnear;
+ b = tfar;
+ return mask;
+}
--- a/src/shapes.cc Sun Apr 27 14:19:37 2008 +0200
+++ b/src/shapes.cc Sun Apr 27 19:56:23 2008 +0200
@@ -377,15 +377,15 @@
const __m128 beta = _mm_add_ps(_mm_mul_ps(hv, _mm_set_ps1(bnu)),
_mm_mul_ps(hu, _mm_set_ps1(bnv)));
- mask = _mm_and_ps(mask, _mm_cmpge_ps(beta, zeros));
+ mask = _mm_and_ps(mask, _mm_cmpge_ps(beta, mZero));
if (!_mm_movemask_ps(mask))
return mask;
const __m128 gamma = _mm_add_ps(_mm_mul_ps(hu, _mm_set_ps1(cnv)),
_mm_mul_ps(hv, _mm_set_ps1(cnu)));
- mask = _mm_and_ps(mask, _mm_and_ps(_mm_cmpge_ps(gamma, zeros),
- _mm_cmple_ps(_mm_add_ps(beta, gamma), ones)));
+ mask = _mm_and_ps(mask, _mm_and_ps(_mm_cmpge_ps(gamma, mZero),
+ _mm_cmple_ps(_mm_add_ps(beta, gamma), mOne)));
if (!_mm_movemask_ps(mask))
return mask;