diff -r 9d66d323c354 -r 9af5c039b678 src/kdtree.cc --- a/src/kdtree.cc Fri May 02 13:27:47 2008 +0200 +++ b/src/kdtree.cc Mon May 05 15:31:14 2008 +0200 @@ -64,12 +64,10 @@ { if (isLeaf()) delete getShapes(); - else - delete[] getLeftChild(); } // kd-tree recursive build algorithm, inspired by PBRT (www.pbrt.org) -void KdTree::recursive_build(KdNode *node, BBox bounds, int maxdepth) +void KdTree::recursive_build(KdNode *node, const BBox &bounds, int maxdepth) { ShapeList *shapes = node->getShapes(); @@ -102,12 +100,12 @@ sort(edges[ax].begin(), edges[ax].end()); // choose best split pos - const Float K = 1.4; // constant, K = cost of traversal / cost of ray-triangle intersection + const Float K = 1.4f; // constant, K = cost of traversal / cost of ray-triangle intersection Float SAV = (bounds.w()*bounds.h() + // surface area of node bounds.w()*bounds.d() + bounds.h()*bounds.d()); Float cost = SAV * (K + shapes->size()); // initial cost = non-split cost - vector::iterator edge, splitedge = edges[2].end(); + vector::iterator edge, splitedge = edges[0].end(); int axis = 0; for (int ax = 0; ax < 3; ax++) { @@ -120,8 +118,8 @@ rnum--; // calculate SAH cost of this split - lbb.H.cell[ax] = edge->pos; - rbb.L.cell[ax] = edge->pos; + lbb.H[ax] = edge->pos; + rbb.L[ax] = edge->pos; Float SAL = (lbb.w()*lbb.h() + lbb.w()*lbb.d() + lbb.h()*lbb.d()); Float SAR = (rbb.w()*rbb.h() + rbb.w()*rbb.d() + rbb.h()*rbb.d()); Float splitcost = K*SAV + SAL*(K + lnum) + SAR*(K + rnum); @@ -138,7 +136,10 @@ } } - if (splitedge == edges[2].end()) + // we actually need to compare with edges[0].end(), but + // MSVC does not allow comparison of iterators from differen instances of vector + // it's OK this way, because axis will be zero if no good split was found + if (splitedge == edges[axis].end()) { node->setLeaf(); return; @@ -152,18 +153,18 @@ static ofstream F("kdtree.obj"); Vector v; static int f=0; - v.cell[axis] = node->getSplit(); - v.cell[(axis+1)%3] = bounds.L.cell[(axis+1)%3]; - v.cell[(axis+2)%3] = bounds.L.cell[(axis+2)%3]; + v[axis] = node->getSplit(); + v[(axis+1)%3] = bounds.L[(axis+1)%3]; + v[(axis+2)%3] = bounds.L[(axis+2)%3]; F << "v " << v.x << " " << v.y << " " << v.z << endl; - v.cell[(axis+1)%3] = bounds.L.cell[(axis+1)%3]; - v.cell[(axis+2)%3] = bounds.H.cell[(axis+2)%3]; + v[(axis+1)%3] = bounds.L[(axis+1)%3]; + v[(axis+2)%3] = bounds.H[(axis+2)%3]; F << "v " << v.x << " " << v.y << " " << v.z << endl; - v.cell[(axis+1)%3] = bounds.H.cell[(axis+1)%3]; - v.cell[(axis+2)%3] = bounds.H.cell[(axis+2)%3]; + v[(axis+1)%3] = bounds.H[(axis+1)%3]; + v[(axis+2)%3] = bounds.H[(axis+2)%3]; F << "v " << v.x << " " << v.y << " " << v.z << endl; - v.cell[(axis+1)%3] = bounds.H.cell[(axis+1)%3]; - v.cell[(axis+2)%3] = bounds.L.cell[(axis+2)%3]; + v[(axis+1)%3] = bounds.H[(axis+1)%3]; + v[(axis+2)%3] = bounds.L[(axis+2)%3]; F << "v " << v.x << " " << v.y << " " << v.z << endl; F << "f " << f+1 << " " << f+2 << " " << f+3 << " " << f+4 << endl; f += 4; @@ -174,9 +175,10 @@ BBox lbb = bounds; BBox rbb = bounds; - lbb.H.cell[axis] = node->getSplit(); - rbb.L.cell[axis] = node->getSplit(); - node->setChildren(new KdNode[2]); + lbb.H[axis] = node->getSplit(); + rbb.L[axis] = node->getSplit(); + node->setChildren(new (mempool.alloc()) KdNode); + new (mempool.alloc()) KdNode; node->setAxis(axis); for (edge = edges[axis].begin(); edge != splitedge; edge++) @@ -219,7 +221,12 @@ KdNode *farchild, *node; node = root; +#ifdef MSVC + // MSVC wants constant expression here... hope it won't overflow :) + StackElem stack[64]; +#else StackElem stack[max_depth]; +#endif int entry = 0, exit = 1; stack[entry].t = a; @@ -294,11 +301,11 @@ stack[exit].prev = tmp; stack[exit].t = t; stack[exit].node = farchild; - stack[exit].pb.cell[axis] = splitVal; - stack[exit].pb.cell[mod3[axis+1]] = ray.o.cell[mod3[axis+1]] - + t * ray.dir.cell[mod3[axis+1]]; - stack[exit].pb.cell[mod3[axis+2]] = ray.o.cell[mod3[axis+2]] - + t * ray.dir.cell[mod3[axis+2]]; + stack[exit].pb[axis] = splitVal; + stack[exit].pb[mod3[axis+1]] = ray.o[mod3[axis+1]] + + t * ray.dir[mod3[axis+1]]; + stack[exit].pb[mod3[axis+2]] = ray.o[mod3[axis+2]] + + t * ray.dir[mod3[axis+2]]; } /* current node is the leaf . . . empty or full */ @@ -328,22 +335,22 @@ return NULL; } -#ifndef NO_SSE +#ifndef NO_SIMD // stack element for kd-tree traversal (packet version) struct StackElem4 { KdNode* node; /* pointer to far child */ - __m128 t; /* the entry/exit signed distance */ + mfloat4 t; /* the entry/exit signed distance */ VectorPacket pb; /* the coordinates of entry/exit point */ int prev; }; -void KdTree::packet_intersection(const Shape **origin_shapes, const RayPacket &rays, +void KdTree::packet_intersection(const Shape* const* origin_shapes, const RayPacket &rays, Float *nearest_distances, Shape **nearest_shapes) { - __m128 a, b; /* entry/exit signed distance */ - __m128 t; /* signed distance to the splitting plane */ - __m128 mask = mZero; + mfloat4 a, b; /* entry/exit signed distance */ + mfloat4 t; /* signed distance to the splitting plane */ + mfloat4 mask = mZero; /* if we have no tree, fall back to naive test */ if (!built) @@ -353,27 +360,29 @@ memset(nearest_shapes, 0, 4*sizeof(Shape*)); mask = bbox.intersect_packet(rays, a, b); - if (!_mm_movemask_ps(mask)) + if (!mmovemask(mask)) return; /* pointers to the far child node and current node */ KdNode *farchild, *node; node = root; +#ifdef MSVC + // MSVC wants constant expression here... hope it won't overflow :) + StackElem4 stack[64]; +#else StackElem4 stack[max_depth]; +#endif int entry = 0, exit = 1; stack[entry].t = a; /* distinguish between internal and external origin of a ray*/ - t = _mm_cmplt_ps(a, mZero); + t = mcmplt(a, mZero); stack[entry].pb = rays.o + rays.dir * a; - stack[entry].pb.mx = _mm_or_ps(_mm_andnot_ps(t, stack[entry].pb.mx), - _mm_and_ps(t, rays.o.mx)); - stack[entry].pb.my = _mm_or_ps(_mm_andnot_ps(t, stack[entry].pb.my), - _mm_and_ps(t, rays.o.my)); - stack[entry].pb.mz = _mm_or_ps(_mm_andnot_ps(t, stack[entry].pb.mz), - _mm_and_ps(t, rays.o.mz)); + stack[entry].pb.mx = mselect(t, rays.o.mx, stack[entry].pb.mx); + stack[entry].pb.my = mselect(t, rays.o.my, stack[entry].pb.my); + stack[entry].pb.mz = mselect(t, rays.o.mz, stack[entry].pb.mz); /* setup initial exit point in the stack */ stack[exit].t = b; @@ -382,7 +391,7 @@ /* loop, traverse through the whole kd-tree, until an object is intersected or ray leaves the scene */ - __m128 splitVal; + mfloat4 splitVal; int axis; static const int mod3[] = {0,1,2,0,1}; const VectorPacket invdirs = mOne / rays.dir; @@ -392,21 +401,21 @@ while (!node->isLeaf()) { /* retrieve position of splitting plane */ - splitVal = _mm_set_ps1(node->getSplit()); + splitVal = mset1(node->getSplit()); axis = node->getAxis(); // mask out invalid rays with near > far - const __m128 curmask = _mm_and_ps(mask, _mm_cmple_ps(stack[entry].t, stack[exit].t)); - const __m128 entry_lt = _mm_cmplt_ps(stack[entry].pb.ma[axis], splitVal); - const __m128 entry_gt = _mm_cmpgt_ps(stack[entry].pb.ma[axis], splitVal); - const __m128 exit_lt = _mm_cmplt_ps(stack[exit].pb.ma[axis], splitVal); - const __m128 exit_gt = _mm_cmpgt_ps(stack[exit].pb.ma[axis], splitVal); + const mfloat4 curmask = mand(mask, mcmple(stack[entry].t, stack[exit].t)); + const mfloat4 entry_lt = mcmplt(stack[entry].pb.ma[axis], splitVal); + const mfloat4 entry_gt = mcmpgt(stack[entry].pb.ma[axis], splitVal); + const mfloat4 exit_lt = mcmplt(stack[exit].pb.ma[axis], splitVal); + const mfloat4 exit_gt = mcmpgt(stack[exit].pb.ma[axis], splitVal); // if all of // stack[entry].pb[axis] <= splitVal, // stack[exit].pb[axis] <= splitVal - if (!_mm_movemask_ps( - _mm_and_ps(_mm_or_ps(entry_gt, exit_gt), curmask))) + if (!mmovemask( + mand(mor(entry_gt, exit_gt), curmask))) { node = node->getLeftChild(); continue; @@ -415,8 +424,8 @@ // if all of // stack[entry].pb[axis] >= splitVal, // stack[exit].pb[axis] >= splitVal - if (!_mm_movemask_ps( - _mm_and_ps(_mm_or_ps(entry_lt, exit_lt), curmask))) + if (!mmovemask( + mand(mor(entry_lt, exit_lt), curmask))) { node = node->getRightChild(); continue; @@ -425,14 +434,14 @@ // any of // stack[entry].pb[axis] < splitVal, // stack[exit].pb[axis] > splitVal - bool cond1 = _mm_movemask_ps( - _mm_and_ps(_mm_and_ps(entry_lt, exit_gt), curmask)); + int cond1 = mmovemask( + mand(mand(entry_lt, exit_gt), curmask)); // any of // stack[entry].pb[axis] > splitVal, // stack[exit].pb[axis] < splitVal - bool cond2 = _mm_movemask_ps( - _mm_and_ps(_mm_and_ps(entry_gt, exit_lt), curmask)); + int cond2 = mmovemask( + mand(mand(entry_gt, exit_lt), curmask)); if ((!cond1 && !cond2) || (cond1 && cond2)) { @@ -459,7 +468,7 @@ /* traverse both children */ /* signed distance to the splitting plane */ - t = _mm_mul_ps(_mm_sub_ps(splitVal, rays.o.ma[axis]), invdirs.ma[axis]); + t = mmul(msub(splitVal, rays.o.ma[axis]), invdirs.ma[axis]); /* setup the new exit point and push it onto stack */ register int tmp = exit; @@ -474,22 +483,22 @@ stack[exit].node = farchild; stack[exit].pb.ma[axis] = splitVal; stack[exit].pb.ma[mod3[axis+1]] = - _mm_add_ps(rays.o.ma[mod3[axis+1]], _mm_mul_ps(t, rays.dir.ma[mod3[axis+1]])); + madd(rays.o.ma[mod3[axis+1]], mmul(t, rays.dir.ma[mod3[axis+1]])); stack[exit].pb.ma[mod3[axis+2]] = - _mm_add_ps(rays.o.ma[mod3[axis+2]], _mm_mul_ps(t, rays.dir.ma[mod3[axis+2]])); + madd(rays.o.ma[mod3[axis+2]], mmul(t, rays.dir.ma[mod3[axis+2]])); } /* current node is the leaf . . . empty or full */ - __m128 dists = stack[exit].t; + mfloat4 dists = stack[exit].t; ShapeList::iterator shape; - __m128 results; - __m128 newmask = mask; + mfloat4 results; + mfloat4 newmask = mask; for (shape = node->getShapes()->begin(); shape != node->getShapes()->end(); shape++) { results = (*shape)->intersect_packet(rays, dists); - int valid = _mm_movemask_ps( - _mm_and_ps(mask, _mm_and_ps(results, - _mm_cmpge_ps(dists, _mm_sub_ps(stack[entry].t, mEps))))); + int valid = mmovemask( + mand(mask, mand(results, + mcmpge(dists, msub(stack[entry].t, mEps))))); for (int i = 0; i < 4; i++) { if (*shape != origin_shapes[i] && ((valid>>i)&1)) @@ -502,7 +511,7 @@ } mask = newmask; - if (!_mm_movemask_ps(mask)) + if (!mmovemask(mask)) return; entry = exit;