52 } |
52 } |
53 } |
53 } |
54 return false; |
54 return false; |
55 } |
55 } |
56 |
56 |
57 #ifndef NO_SSE |
57 #ifndef NO_SIMD |
58 __m128 Sphere::intersect_packet(const RayPacket &rays, __m128 &dists) |
58 mfloat4 Sphere::intersect_packet(const RayPacket &rays, mfloat4 &dists) const |
59 { |
59 { |
60 VectorPacket V = rays.o - VectorPacket(center); |
60 VectorPacket V = rays.o - VectorPacket(center); |
61 register __m128 d = _mm_sub_ps(mZero, dot(V, rays.dir)); |
61 register mfloat4 d = msub(mZero, dot(V, rays.dir)); |
62 register __m128 Det = _mm_sub_ps(_mm_mul_ps(d, d), |
62 register mfloat4 Det = msub(mmul(d, d), msub(dot(V,V), mset1(sqr_radius))); |
63 _mm_sub_ps(dot(V,V), _mm_set_ps1(sqr_radius))); |
63 register mfloat4 t1, t2, mask; |
64 register __m128 t1, t2, mask; |
64 |
65 |
65 mask = mcmpgt(Det, mZero); |
66 mask = _mm_cmpgt_ps(Det, mZero); |
66 if (!mmovemask(mask)) |
67 if (!_mm_movemask_ps(mask)) |
|
68 return mask; |
67 return mask; |
69 |
68 |
70 Det = _mm_sqrt_ps(Det); |
69 Det = msqrt(Det); |
71 t1 = _mm_sub_ps(d, Det); |
70 t1 = msub(d, Det); |
72 t2 = _mm_add_ps(d, Det); |
71 t2 = madd(d, Det); |
73 |
72 |
74 mask = _mm_and_ps(mask, _mm_cmpgt_ps(t2, mZero)); |
73 mask = mand(mask, mcmpgt(t2, mZero)); |
75 |
74 |
76 const __m128 cond1 = _mm_and_ps(_mm_cmpgt_ps(t1, mZero), _mm_cmplt_ps(t1, dists)); |
75 const mfloat4 cond1 = mand(mcmpgt(t1, mZero), mcmplt(t1, dists)); |
77 const __m128 cond2 = _mm_and_ps(_mm_cmple_ps(t1, mZero), _mm_cmplt_ps(t2, dists)); |
76 const mfloat4 cond2 = mand(mcmple(t1, mZero), mcmplt(t2, dists)); |
78 const __m128 newdists = _mm_or_ps(_mm_and_ps(cond1, t1), _mm_and_ps(cond2, t2)); |
77 const mfloat4 newdists = mor(mand(cond1, t1), mand(cond2, t2)); |
79 mask = _mm_and_ps(mask, _mm_or_ps(cond1, cond2)); |
78 mask = mand(mask, mor(cond1, cond2)); |
80 dists = _mm_or_ps(_mm_and_ps(mask, newdists), _mm_andnot_ps(mask, dists)); |
79 dists = mselect(mask, newdists, dists); |
81 return mask; |
80 return mask; |
82 } |
81 } |
83 #endif |
82 #endif |
84 |
83 |
85 /* if there should be CSG sometimes, this may be needed... */ |
84 /* if there should be CSG sometimes, this may be needed... */ |
175 return true; |
174 return true; |
176 } |
175 } |
177 return false; |
176 return false; |
178 } |
177 } |
179 |
178 |
180 #ifndef NO_SSE |
179 #ifndef NO_SIMD |
181 __m128 Box::intersect_packet(const RayPacket &rays, __m128 &dists) |
180 mfloat4 Box::intersect_packet(const RayPacket &rays, mfloat4 &dists) const |
182 { |
181 { |
183 register __m128 tnear = mZero; |
182 mfloat4 origin = rays.o.ma[0]; |
184 register __m128 tfar = mInf; |
183 mfloat4 invdir = rays.invdir.ma[0]; |
185 register __m128 t1, t2; |
184 mfloat4 t1 = mmul(msub(mset1(L[0]), origin), invdir); |
186 register __m128 mask = mAllSet; |
185 mfloat4 t2 = mmul(msub(mset1(H[0]), origin), invdir); |
187 |
186 mfloat4 tmin = mmin(t1, t2); |
188 for (int i = 0; i < 3; i++) |
187 mfloat4 tmax = mmax(t1, t2); |
189 { |
188 |
190 const __m128 mL = _mm_set_ps1(L[i]); |
189 origin = rays.o.ma[1]; |
191 const __m128 mH = _mm_set_ps1(H[i]); |
190 invdir = rays.invdir.ma[1]; |
192 mask = _mm_and_ps(mask, |
191 t1 = mmul(msub(mset1(L[1]), origin), invdir); |
193 _mm_or_ps( |
192 t2 = mmul(msub(mset1(H[1]), origin), invdir); |
194 _mm_or_ps(_mm_cmplt_ps(rays.dir.ma[i], mMEps), _mm_cmpgt_ps(rays.dir.ma[i], mEps)), |
193 tmin = mmax(mmin(t1, t2), tmin); |
195 _mm_and_ps(_mm_cmpge_ps(rays.o.ma[i], mL), _mm_cmple_ps(rays.o.ma[i], mH)) |
194 tmax = mmin(mmax(t1, t2), tmax); |
196 )); |
195 |
197 if (!_mm_movemask_ps(mask)) |
196 origin = rays.o.ma[2]; |
198 return mask; |
197 invdir = rays.invdir.ma[2]; |
199 |
198 t1 = mmul(msub(mset1(L[2]), origin), invdir); |
200 /* compute the intersection distance of the planes */ |
199 t2 = mmul(msub(mset1(H[2]), origin), invdir); |
201 t1 = _mm_div_ps(_mm_sub_ps(mL, rays.o.ma[i]), rays.dir.ma[i]); |
200 tmin = mmax(mmin(t1, t2), tmin); |
202 t2 = _mm_div_ps(_mm_sub_ps(mH, rays.o.ma[i]), rays.dir.ma[i]); |
201 tmax = mmin(mmax(t1, t2), tmax); |
203 |
202 |
204 __m128 t = _mm_min_ps(t1, t2); |
203 mfloat4 mask = mand(mand(mcmplt(tmin, tmax), mcmpgt(tmax, mZero)), mcmplt(tmin, dists)); |
205 t2 = _mm_max_ps(t1, t2); |
204 dists = mselect(mask, tmin, dists); |
206 t1 = t; |
|
207 |
|
208 tnear = _mm_max_ps(tnear, t1); /* want largest Tnear */ |
|
209 tfar = _mm_min_ps(tfar, t2); /* want smallest Tfar */ |
|
210 |
|
211 mask = _mm_and_ps(mask, |
|
212 _mm_and_ps(_mm_cmple_ps(tnear, tfar), _mm_cmpge_ps(tfar, mZero))); |
|
213 if (!_mm_movemask_ps(mask)) |
|
214 return mask; |
|
215 } |
|
216 |
|
217 mask = _mm_and_ps(mask, _mm_cmplt_ps(tnear, dists)); |
|
218 dists = _mm_or_ps(_mm_and_ps(mask, tnear), _mm_andnot_ps(mask, dists)); |
|
219 return mask; |
205 return mask; |
220 } |
206 } |
221 #endif |
207 #endif |
222 |
208 |
223 bool Box::intersect_bbox(const BBox &bbox) const |
209 bool Box::intersect_bbox(const BBox &bbox) const |
417 dist = t; |
403 dist = t; |
418 return true; |
404 return true; |
419 #endif |
405 #endif |
420 } |
406 } |
421 |
407 |
422 #if not defined(NO_SSE) and defined(TRI_BARI_PRE) |
408 #if !defined(NO_SIMD) && defined(TRI_BARI_PRE) |
423 __m128 Triangle::intersect_packet(const RayPacket &rays, __m128 &dists) |
409 mfloat4 Triangle::intersect_packet(const RayPacket &rays, mfloat4 &dists) const |
424 { |
410 { |
425 static const int modulo3[5] = {0,1,2,0,1}; |
411 static const int modulo3[5] = {0,1,2,0,1}; |
426 register const int u = modulo3[k+1]; |
412 register const int u = modulo3[k+1]; |
427 register const int v = modulo3[k+2]; |
413 register const int v = modulo3[k+2]; |
428 __m128 mask; |
414 mfloat4 mask; |
429 |
415 |
430 const __m128 t = _mm_div_ps( |
416 const mfloat4 t = mdiv( |
431 _mm_sub_ps(_mm_sub_ps( |
417 msub(msub( |
432 _mm_sub_ps(_mm_set_ps1(nd), rays.o.ma[k]), |
418 msub(mset1(nd), rays.o.ma[k]), |
433 _mm_mul_ps(_mm_set_ps1(nu), rays.o.ma[u]) |
419 mmul(mset1(nu), rays.o.ma[u]) |
434 ), _mm_mul_ps(_mm_set_ps1(nv), rays.o.ma[v])), |
420 ), mmul(mset1(nv), rays.o.ma[v])), |
435 _mm_add_ps(rays.dir.ma[k], |
421 madd(rays.dir.ma[k], |
436 _mm_add_ps(_mm_mul_ps(_mm_set_ps1(nu), rays.dir.ma[u]), |
422 madd(mmul(mset1(nu), rays.dir.ma[u]), |
437 _mm_mul_ps(_mm_set_ps1(nv), rays.dir.ma[v]))) |
423 mmul(mset1(nv), rays.dir.ma[v]))) |
438 ); |
424 ); |
439 |
425 |
440 mask = _mm_and_ps(_mm_cmplt_ps(t, dists), _mm_cmpge_ps(t, mEps)); |
426 mask = mand(mcmplt(t, dists), mcmpge(t, mEps)); |
441 if (!_mm_movemask_ps(mask)) |
427 if (!mmovemask(mask)) |
442 return mask; |
428 return mask; |
443 |
429 |
444 const __m128 hu = _mm_sub_ps(_mm_add_ps(rays.o.ma[u], |
430 const mfloat4 hu = msub(madd(rays.o.ma[u], |
445 _mm_mul_ps(t, rays.dir.ma[u])), _mm_set_ps1(A->P[u])); |
431 mmul(t, rays.dir.ma[u])), mset1(A->P[u])); |
446 const __m128 hv = _mm_sub_ps(_mm_add_ps(rays.o.ma[v], |
432 const mfloat4 hv = msub(madd(rays.o.ma[v], |
447 _mm_mul_ps(t, rays.dir.ma[v])), _mm_set_ps1(A->P[v])); |
433 mmul(t, rays.dir.ma[v])), mset1(A->P[v])); |
448 const __m128 beta = _mm_add_ps(_mm_mul_ps(hv, _mm_set_ps1(bnu)), |
434 const mfloat4 beta = madd(mmul(hv, mset1(bnu)), |
449 _mm_mul_ps(hu, _mm_set_ps1(bnv))); |
435 mmul(hu, mset1(bnv))); |
450 |
436 |
451 mask = _mm_and_ps(mask, _mm_cmpge_ps(beta, mZero)); |
437 mask = mand(mask, mcmpge(beta, mZero)); |
452 if (!_mm_movemask_ps(mask)) |
438 if (!mmovemask(mask)) |
453 return mask; |
439 return mask; |
454 |
440 |
455 const __m128 gamma = _mm_add_ps(_mm_mul_ps(hu, _mm_set_ps1(cnv)), |
441 const mfloat4 gamma = madd(mmul(hu, mset1(cnv)), |
456 _mm_mul_ps(hv, _mm_set_ps1(cnu))); |
442 mmul(hv, mset1(cnu))); |
457 |
443 |
458 mask = _mm_and_ps(mask, _mm_and_ps(_mm_cmpge_ps(gamma, mZero), |
444 mask = mand(mask, mand(mcmpge(gamma, mZero), |
459 _mm_cmple_ps(_mm_add_ps(beta, gamma), mOne))); |
445 mcmple(madd(beta, gamma), mOne))); |
460 if (!_mm_movemask_ps(mask)) |
446 if (!mmovemask(mask)) |
461 return mask; |
447 return mask; |
462 |
448 |
463 dists = _mm_or_ps(_mm_andnot_ps(mask, dists), _mm_and_ps(mask, t)); |
449 dists = mselect(mask, t, dists); |
464 return mask; |
450 return mask; |
465 } |
451 } |
466 #endif |
452 #endif |
467 |
453 |
468 bool Triangle::intersect_bbox(const BBox &bbox) const |
454 bool Triangle::intersect_bbox(const BBox &bbox) const |