diff -r 9d66d323c354 -r 9af5c039b678 src/shapes.cc --- a/src/shapes.cc Fri May 02 13:27:47 2008 +0200 +++ b/src/shapes.cc Mon May 05 15:31:14 2008 +0200 @@ -54,30 +54,29 @@ return false; } -#ifndef NO_SSE -__m128 Sphere::intersect_packet(const RayPacket &rays, __m128 &dists) +#ifndef NO_SIMD +mfloat4 Sphere::intersect_packet(const RayPacket &rays, mfloat4 &dists) const { VectorPacket V = rays.o - VectorPacket(center); - register __m128 d = _mm_sub_ps(mZero, dot(V, rays.dir)); - register __m128 Det = _mm_sub_ps(_mm_mul_ps(d, d), - _mm_sub_ps(dot(V,V), _mm_set_ps1(sqr_radius))); - register __m128 t1, t2, mask; + register mfloat4 d = msub(mZero, dot(V, rays.dir)); + register mfloat4 Det = msub(mmul(d, d), msub(dot(V,V), mset1(sqr_radius))); + register mfloat4 t1, t2, mask; - mask = _mm_cmpgt_ps(Det, mZero); - if (!_mm_movemask_ps(mask)) + mask = mcmpgt(Det, mZero); + if (!mmovemask(mask)) return mask; - Det = _mm_sqrt_ps(Det); - t1 = _mm_sub_ps(d, Det); - t2 = _mm_add_ps(d, Det); + Det = msqrt(Det); + t1 = msub(d, Det); + t2 = madd(d, Det); - mask = _mm_and_ps(mask, _mm_cmpgt_ps(t2, mZero)); + mask = mand(mask, mcmpgt(t2, mZero)); - const __m128 cond1 = _mm_and_ps(_mm_cmpgt_ps(t1, mZero), _mm_cmplt_ps(t1, dists)); - const __m128 cond2 = _mm_and_ps(_mm_cmple_ps(t1, mZero), _mm_cmplt_ps(t2, dists)); - const __m128 newdists = _mm_or_ps(_mm_and_ps(cond1, t1), _mm_and_ps(cond2, t2)); - mask = _mm_and_ps(mask, _mm_or_ps(cond1, cond2)); - dists = _mm_or_ps(_mm_and_ps(mask, newdists), _mm_andnot_ps(mask, dists)); + const mfloat4 cond1 = mand(mcmpgt(t1, mZero), mcmplt(t1, dists)); + const mfloat4 cond2 = mand(mcmple(t1, mZero), mcmplt(t2, dists)); + const mfloat4 newdists = mor(mand(cond1, t1), mand(cond2, t2)); + mask = mand(mask, mor(cond1, cond2)); + dists = mselect(mask, newdists, dists); return mask; } #endif @@ -177,45 +176,32 @@ return false; } -#ifndef NO_SSE -__m128 Box::intersect_packet(const RayPacket &rays, __m128 &dists) +#ifndef NO_SIMD +mfloat4 Box::intersect_packet(const RayPacket &rays, mfloat4 &dists) const { - register __m128 tnear = mZero; - register __m128 tfar = mInf; - register __m128 t1, t2; - register __m128 mask = mAllSet; - - for (int i = 0; i < 3; i++) - { - const __m128 mL = _mm_set_ps1(L[i]); - const __m128 mH = _mm_set_ps1(H[i]); - mask = _mm_and_ps(mask, - _mm_or_ps( - _mm_or_ps(_mm_cmplt_ps(rays.dir.ma[i], mMEps), _mm_cmpgt_ps(rays.dir.ma[i], mEps)), - _mm_and_ps(_mm_cmpge_ps(rays.o.ma[i], mL), _mm_cmple_ps(rays.o.ma[i], mH)) - )); - if (!_mm_movemask_ps(mask)) - return mask; + mfloat4 origin = rays.o.ma[0]; + mfloat4 invdir = rays.invdir.ma[0]; + mfloat4 t1 = mmul(msub(mset1(L[0]), origin), invdir); + mfloat4 t2 = mmul(msub(mset1(H[0]), origin), invdir); + mfloat4 tmin = mmin(t1, t2); + mfloat4 tmax = mmax(t1, t2); - /* compute the intersection distance of the planes */ - t1 = _mm_div_ps(_mm_sub_ps(mL, rays.o.ma[i]), rays.dir.ma[i]); - t2 = _mm_div_ps(_mm_sub_ps(mH, rays.o.ma[i]), rays.dir.ma[i]); - - __m128 t = _mm_min_ps(t1, t2); - t2 = _mm_max_ps(t1, t2); - t1 = t; + origin = rays.o.ma[1]; + invdir = rays.invdir.ma[1]; + t1 = mmul(msub(mset1(L[1]), origin), invdir); + t2 = mmul(msub(mset1(H[1]), origin), invdir); + tmin = mmax(mmin(t1, t2), tmin); + tmax = mmin(mmax(t1, t2), tmax); - tnear = _mm_max_ps(tnear, t1); /* want largest Tnear */ - tfar = _mm_min_ps(tfar, t2); /* want smallest Tfar */ + origin = rays.o.ma[2]; + invdir = rays.invdir.ma[2]; + t1 = mmul(msub(mset1(L[2]), origin), invdir); + t2 = mmul(msub(mset1(H[2]), origin), invdir); + tmin = mmax(mmin(t1, t2), tmin); + tmax = mmin(mmax(t1, t2), tmax); - mask = _mm_and_ps(mask, - _mm_and_ps(_mm_cmple_ps(tnear, tfar), _mm_cmpge_ps(tfar, mZero))); - if (!_mm_movemask_ps(mask)) - return mask; - } - - mask = _mm_and_ps(mask, _mm_cmplt_ps(tnear, dists)); - dists = _mm_or_ps(_mm_and_ps(mask, tnear), _mm_andnot_ps(mask, dists)); + mfloat4 mask = mand(mand(mcmplt(tmin, tmax), mcmpgt(tmax, mZero)), mcmplt(tmin, dists)); + dists = mselect(mask, tmin, dists); return mask; } #endif @@ -419,48 +405,48 @@ #endif } -#if not defined(NO_SSE) and defined(TRI_BARI_PRE) -__m128 Triangle::intersect_packet(const RayPacket &rays, __m128 &dists) +#if !defined(NO_SIMD) && defined(TRI_BARI_PRE) +mfloat4 Triangle::intersect_packet(const RayPacket &rays, mfloat4 &dists) const { static const int modulo3[5] = {0,1,2,0,1}; register const int u = modulo3[k+1]; register const int v = modulo3[k+2]; - __m128 mask; + mfloat4 mask; - const __m128 t = _mm_div_ps( - _mm_sub_ps(_mm_sub_ps( - _mm_sub_ps(_mm_set_ps1(nd), rays.o.ma[k]), - _mm_mul_ps(_mm_set_ps1(nu), rays.o.ma[u]) - ), _mm_mul_ps(_mm_set_ps1(nv), rays.o.ma[v])), - _mm_add_ps(rays.dir.ma[k], - _mm_add_ps(_mm_mul_ps(_mm_set_ps1(nu), rays.dir.ma[u]), - _mm_mul_ps(_mm_set_ps1(nv), rays.dir.ma[v]))) + const mfloat4 t = mdiv( + msub(msub( + msub(mset1(nd), rays.o.ma[k]), + mmul(mset1(nu), rays.o.ma[u]) + ), mmul(mset1(nv), rays.o.ma[v])), + madd(rays.dir.ma[k], + madd(mmul(mset1(nu), rays.dir.ma[u]), + mmul(mset1(nv), rays.dir.ma[v]))) ); - mask = _mm_and_ps(_mm_cmplt_ps(t, dists), _mm_cmpge_ps(t, mEps)); - if (!_mm_movemask_ps(mask)) + mask = mand(mcmplt(t, dists), mcmpge(t, mEps)); + if (!mmovemask(mask)) return mask; - const __m128 hu = _mm_sub_ps(_mm_add_ps(rays.o.ma[u], - _mm_mul_ps(t, rays.dir.ma[u])), _mm_set_ps1(A->P[u])); - const __m128 hv = _mm_sub_ps(_mm_add_ps(rays.o.ma[v], - _mm_mul_ps(t, rays.dir.ma[v])), _mm_set_ps1(A->P[v])); - const __m128 beta = _mm_add_ps(_mm_mul_ps(hv, _mm_set_ps1(bnu)), - _mm_mul_ps(hu, _mm_set_ps1(bnv))); + const mfloat4 hu = msub(madd(rays.o.ma[u], + mmul(t, rays.dir.ma[u])), mset1(A->P[u])); + const mfloat4 hv = msub(madd(rays.o.ma[v], + mmul(t, rays.dir.ma[v])), mset1(A->P[v])); + const mfloat4 beta = madd(mmul(hv, mset1(bnu)), + mmul(hu, mset1(bnv))); - mask = _mm_and_ps(mask, _mm_cmpge_ps(beta, mZero)); - if (!_mm_movemask_ps(mask)) + mask = mand(mask, mcmpge(beta, mZero)); + if (!mmovemask(mask)) return mask; - const __m128 gamma = _mm_add_ps(_mm_mul_ps(hu, _mm_set_ps1(cnv)), - _mm_mul_ps(hv, _mm_set_ps1(cnu))); + const mfloat4 gamma = madd(mmul(hu, mset1(cnv)), + mmul(hv, mset1(cnu))); - mask = _mm_and_ps(mask, _mm_and_ps(_mm_cmpge_ps(gamma, mZero), - _mm_cmple_ps(_mm_add_ps(beta, gamma), mOne))); - if (!_mm_movemask_ps(mask)) + mask = mand(mask, mand(mcmpge(gamma, mZero), + mcmple(madd(beta, gamma), mOne))); + if (!mmovemask(mask)) return mask; - dists = _mm_or_ps(_mm_andnot_ps(mask, dists), _mm_and_ps(mask, t)); + dists = mselect(mask, t, dists); return mask; } #endif @@ -580,13 +566,13 @@ v=v0[q]; if(N[q]>0.0f) { - vmin.cell[q]=-boxhalfsize[q] - v; - vmax.cell[q]= boxhalfsize[q] - v; + vmin[q]=-boxhalfsize[q] - v; + vmax[q]= boxhalfsize[q] - v; } else { - vmin.cell[q]= boxhalfsize[q] - v; - vmax.cell[q]=-boxhalfsize[q] - v; + vmin[q]= boxhalfsize[q] - v; + vmax[q]=-boxhalfsize[q] - v; } } if(dot(N,vmin)>0.0f) return false;