packetize Phong shader
new scons config options:
simd=(yes|no) - allow/suppress explicit SSE
force_flags=(yes|no) - force use of specified flags instead of autodetected
profile=(yes|no) - enable gcc's profiling (-pg option)
check for pthread.h header, don't try to build without it
add fourth Vector3 component for better memory aligning
rename Vector3 to Vector
partialy SSE-ize Vector class (only fully vertical operations)
build static lib and python module in distinctive directories
to avoid collision of library file names on some platforms
/* * raytracer.cc: Raytracer class * * This file is part of Pyrit Ray Tracer. * * Copyright 2006, 2007, 2008 Radek Brich * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */#include <pthread.h>#include <stdio.h>#include <stdlib.h>#include <assert.h>#include "raytracer.h"int pyrit_verbosity = 2;// Hammersley spherical point distribution// http://www.cse.cuhk.edu.hk/~ttwong/papers/udpoint/udpoints.htmlVector Raytracer::SphereDistribute(int i, int n, Float extent, const Vector &normal){ Float p, t, st, phi, phirad; int kk; t = 0; for (p=0.5, kk=i; kk; p*=0.5, kk>>=1) if (kk & 1) t += p; t = 1.0 + (t - 1.0)*extent; phi = (i + 0.5) / n; phirad = phi * 2.0 * M_PI; st = sqrt(1.0 - t*t); Float x, y, z, xx, yy, zz, q; x = st * cos(phirad); y = st * sin(phirad); z = t; // rotate against Y axis q = acos(normal.z); zz = z*cos(q) - x*sin(q); xx = z*sin(q) + x*cos(q); yy = y; // rotate against Z axis q = atan2f(normal.y, normal.x); x = xx*cos(q) - yy*sin(q); y = xx*sin(q) + yy*cos(q); z = zz; return Vector(x, y, z);}/* P is point of intersection, N normal in this point, R direction of reflected ray, V direction to the viewer*/Colour Raytracer::PhongShader(const Shape *shape, const Vector &P, const Vector &N, const Vector &V){ Colour col, acc; Material * const &mat = shape->material; if (mat->texture) col = mat->texture->evaluate(P); else col = mat->colour; // ambient acc = mat->ambient * col; vector<Light*>::iterator light; for (light = lights.begin(); light != lights.end(); light++) { const Vector L = normalize((*light)->pos - P); // direction vector to light const Float L_dot_N = dot(L, N); if (L_dot_N > 0) { // test if this light is occluded (sharp shadows) if ((*light)->cast_shadows) { const Ray shadow_ray = Ray(P, L); Float dist = FLT_MAX; if (top->nearest_intersection(shape, shadow_ray, dist)) continue; } const Vector R = L - 2.0 * L_dot_N * N; const Float R_dot_V = dot(R, V); // diffuse acc += mat->diffuse * col * (*light)->colour * L_dot_N; // specular if (R_dot_V > 0) acc += mat->specular * (*light)->colour * powf(R_dot_V, mat->shininess); } } return acc;}#ifndef NO_SSEVectorPacket Raytracer::PhongShader_packet(const Shape **shapes, const VectorPacket &P, const VectorPacket &N, const VectorPacket &V){ VectorPacket acc, colour; union { __m128 ambient; float ambient_f[4]; }; union { __m128 diffuse; float diffuse_f[4]; }; union { __m128 specular; float specular_f[4]; }; union { __m128 shininess; float shininess_f[4]; }; for (int i = 0; i < 4; i++) if (shapes[i] == NULL) { ambient_f[i] = 0; diffuse_f[i] = 0; specular_f[i] = 0; shininess_f[i] = 0; } else { Material * const &mat = shapes[i]->material; if (mat->texture) colour.setVector(i, mat->texture->evaluate(P.getVector(i))); else colour.setVector(i, mat->colour); ambient_f[i] = mat->ambient; diffuse_f[i] = mat->diffuse; specular_f[i] = mat->specular; shininess_f[i] = mat->shininess; } // ambient acc = colour * ambient; Shape **shadow_shapes; vector<Light*>::iterator light; for (light = lights.begin(); light != lights.end(); light++) { // direction vector to light VectorPacket L = VectorPacket((*light)->pos) - P; L.normalize(); const __m128 L_dot_N = dot(L, N); __m128 valid = _mm_cmpgt_ps(L_dot_N, mZero); // test if this light is occluded (sharp shadows) if ((*light)->cast_shadows) { const RayPacket shadow_rays = RayPacket(P, L); union { __m128 dists; float dists_f[4]; }; dists = mInf; top->packet_intersection(shapes, shadow_rays, dists_f, shadow_shapes); valid = _mm_and_ps(valid, _mm_cmpeq_ps(dists, mInf)); } const VectorPacket R = L - N * _mm_mul_ps(mTwo, L_dot_N); const __m128 R_dot_V = dot(R, V); // diffuse acc.selectiveAdd(valid, colour * VectorPacket((*light)->colour) * _mm_mul_ps(diffuse, L_dot_N)); // specular valid = _mm_and_ps(valid, _mm_cmpgt_ps(R_dot_V, mZero)); __m128 spec = _mm_mul_ps(_mm_mul_ps(specular, _mm_set_ps1((*light)->colour.r)), _mm_fastpow(R_dot_V, shininess)); acc.selectiveAdd(valid, spec); } return acc;}#endifvoid Raytracer::lightScatter(const Ray &ray, const Shape *shape, int depth, const Vector &P, const Vector &normal, bool from_inside, Colour &col){ if (depth < max_depth) { Colour trans_col, refl_col; Float trans = shape->material->transmissivity; Float refl = shape->material->reflectivity; const Float cos_i = - dot(normal, ray.dir); // reflection if (refl > 0.01) { Vector newdir = ray.dir + 2.0 * cos_i * normal; Ray newray = Ray(P, newdir); refl_col = raytrace(newray, depth + 1, shape); } // refraction if (trans > 0.01) { Float n, n1, n2; if (from_inside) { n1 = shape->material->refract_index; n2 = 1.0; n = n1; } else { n1 = 1.0; n2 = shape->material->refract_index; n = 1.0 / n2; } const Float sin2_t = n*n * (1 - cos_i*cos_i); if (sin2_t >= 1.0) { // totally reflected refl += trans; trans = 0; } else { const Float cos_t = sqrtf(1 - sin2_t); const Float Rdiv = 1.0/(n1*cos_i + n2*cos_t); const Float Rper = (n1*cos_i - n2*cos_t)*Rdiv; const Float Rpar = (n2*cos_i - n1*cos_t)*Rdiv; const Float R = (Rper*Rper + Rpar*Rpar)/2; refl += R*trans; trans = (1-R)*trans; Vector newdir = n * ray.dir + (n*cos_i - cos_t) * normal; Ray newray = Ray(P + 0.001*newdir, newdir); trans_col = raytrace(newray, depth + 1, NULL); } } col = (1-refl-trans)*col + refl*refl_col + trans*trans_col; } // ambient occlusion if (ao_samples && !from_inside) { Float miss = 0; for (int i = 0; i < ao_samples; i++) { Vector dir = SphereDistribute(i, ao_samples, ao_angle, normal); Ray ao_ray = Ray(P, dir); Float dist = ao_distance; Shape *shape_in_way = top->nearest_intersection(shape, ao_ray, dist); if (shape_in_way == NULL) miss += 1.0; else miss += dist / ao_distance; } Float ao_intensity = miss / ao_samples; col = col * ao_intensity; }}Colour Raytracer::raytrace(Ray &ray, int depth, const Shape *origin_shape){ Float nearest_distance = Inf; Shape *nearest_shape = top->nearest_intersection(origin_shape, ray, nearest_distance); if (nearest_shape == NULL) return bg_colour; else { const Vector P = ray.o + ray.dir * nearest_distance; // point of intersection Vector normal = nearest_shape->normal(P); bool from_inside = false; // make shapes double sided if (dot(normal, ray.dir) > 0.0) { normal = - normal; from_inside = true; } // shading function Colour col = PhongShader(nearest_shape, P, normal, ray.dir); lightScatter(ray, nearest_shape, depth, P, normal, from_inside, col); return col; }}#ifndef NO_SSEvoid Raytracer::raytracePacket(RayPacket &rays, Colour *results){ union { float nearest_distances[4]; __m128 m_nearest_distances; }; __m128 mask; Shape *nearest_shapes[4]; static const Shape *origin_shapes[4] = {NULL, NULL, NULL, NULL}; m_nearest_distances = mInf; mask = mAllSet; top->packet_intersection(origin_shapes, rays, nearest_distances, nearest_shapes); mask = _mm_cmpneq_ps(m_nearest_distances, mInf); if (!_mm_movemask_ps(mask)) { for (int i = 0; i < 4; i++) results[i] = bg_colour; return; } const VectorPacket P = rays.o + rays.dir * m_nearest_distances; // point of intersection VectorPacket normal; for (int i = 0; i < 4; i++) if (nearest_shapes[i] != NULL) normal.setVector(i, nearest_shapes[i]->normal(P.getVector(i))); // make shapes double sided __m128 from_inside = _mm_cmpgt_ps(dot(normal, rays.dir), mZero); normal.mx = _mm_or_ps(_mm_and_ps(from_inside, _mm_sub_ps(mZero, normal.mx)), _mm_andnot_ps(from_inside, normal.mx)); normal.my = _mm_or_ps(_mm_and_ps(from_inside, _mm_sub_ps(mZero, normal.my)), _mm_andnot_ps(from_inside, normal.my)); normal.mz = _mm_or_ps(_mm_and_ps(from_inside, _mm_sub_ps(mZero, normal.mz)), _mm_andnot_ps(from_inside, normal.mz)); // shading function VectorPacket pres = PhongShader_packet(const_cast<const Shape**>(nearest_shapes), P, normal, rays.dir); //pres.mx = _mm_or_ps(_mm_and_ps(mask, pres.mx), _mm_andnot_ps(mask, _mm_set_ps1(bg_colour.r))); //pres.my = _mm_or_ps(_mm_and_ps(mask, pres.my), _mm_andnot_ps(mask, _mm_set_ps1(bg_colour.g))); //pres.mz = _mm_or_ps(_mm_and_ps(mask, pres.mz), _mm_andnot_ps(mask, _mm_set_ps1(bg_colour.b))); for (int i = 0; i < 4; i++) if (nearest_shapes[i] != NULL) { results[i] = pres.getVector(i); lightScatter(rays[i], nearest_shapes[i], 0, P.getVector(i), normal.getVector(i), (_mm_movemask_ps(from_inside)>>i)&1, results[i]); } else results[i] = bg_colour;}#endifvoid *Raytracer::raytrace_worker(void *d){ static const int my_queue_size = 256; Raytracer *rt = (Raytracer*)d; Sample my_queue[my_queue_size]; Colour my_colours[my_queue_size]; int my_count; Ray ray;#ifndef NO_SSE RayPacket rays; const bool can_use_packets = (rt->use_packets && rt->sampler->packetableSamples());#endif for (;;) { pthread_mutex_lock(&rt->sample_queue_mutex); while (rt->sample_queue_count == 0) { if (rt->end_of_samples) { dbgmsg(4, "T thread [%d] exiting\n", pthread_self()); pthread_mutex_unlock(&rt->sample_queue_mutex); pthread_exit(NULL); } pthread_cond_wait(&rt->sample_queue_cond, &rt->sample_queue_mutex); } if (rt->sample_queue_count >= my_queue_size) my_count = my_queue_size; else my_count = rt->sample_queue_count; rt->sample_queue_count -= my_count; // copy samples to local queue if (rt->sample_queue_pos + my_count >= rt->sample_queue_size) { register int c = rt->sample_queue_size - rt->sample_queue_pos; memcpy(my_queue, rt->sample_queue + rt->sample_queue_pos, c*sizeof(Sample)); memcpy(my_queue + c, rt->sample_queue, (my_count - c)*sizeof(Sample)); rt->sample_queue_pos = my_count - c; } else { memcpy(my_queue, rt->sample_queue + rt->sample_queue_pos, my_count*sizeof(Sample)); rt->sample_queue_pos += my_count; } if (rt->sample_queue_count <= my_queue_size*2) pthread_cond_signal(&rt->worker_ready_cond); pthread_mutex_unlock(&rt->sample_queue_mutex); // do the work#ifndef NO_SSE if (can_use_packets) { // packet ray tracing assert((my_count % 4) == 0); for (int i = 0; i < my_count; i+=4) { rt->camera->makeRayPacket(my_queue + i, rays); rt->raytracePacket(rays, my_colours + i); } } else#endif { // single ray tracing for (int i = 0; i < my_count; i++) { ray = rt->camera->makeRay(my_queue[i]); my_colours[i] = rt->raytrace(ray, 0, NULL); } } // save the results pthread_mutex_lock(&rt->sampler_mutex); for (int i = 0; i < my_count; i++) rt->sampler->saveSample(my_queue[i], my_colours[i]); pthread_mutex_unlock(&rt->sampler_mutex); }}void Raytracer::render(){ if (!sampler || !camera || !top) return; static const int my_count_max = 256; sample_queue_size = my_count_max*2*num_threads; sample_queue = new Sample [sample_queue_size]; sample_queue_count = 0; int my_count, my_pos; int sampnum = 0, sampdone; int phase = 1; bool more_samples; sampler->init(); // create workers dbgmsg(1, "* using %d threads\n", num_threads); pthread_t threads[num_threads]; dbgmsg(1, "* raytracing...\n"); while ( (sampnum = sampler->initSampleSet()) > 0 ) { my_pos = 0; sample_queue_pos = 0; sampdone = 0; end_of_samples = false; for (int t = 0; t < num_threads; t++) { int rc = pthread_create(&threads[t], NULL, raytrace_worker, (void*)this); if (rc) { dbgmsg(0, "\nE pthread_create unsuccessful, return code was %d\n", rc); exit(1); } } dbgmsg(2, "- phase %d: 0%% done", phase); pthread_mutex_lock(&sampler_mutex); for (;;) { my_count = 0; while ( more_samples = sampler->nextSample(&sample_queue[my_pos++]) ) { my_count++; if (my_pos >= sample_queue_size) my_pos = 0; if (my_count >= my_count_max) break; } if (!more_samples && !my_count) break; pthread_mutex_unlock(&sampler_mutex); pthread_mutex_lock(&sample_queue_mutex); sample_queue_count += my_count; // wait for workers if there is enough samples ready on queue while (sample_queue_count > (2*num_threads-1)*my_count_max) pthread_cond_wait(&worker_ready_cond, &sample_queue_mutex); pthread_cond_signal(&sample_queue_cond); pthread_mutex_unlock(&sample_queue_mutex); sampdone += my_count; dbgmsg(2, "\b\b\b\b\b\b\b\b%2d%% done", ((long)sampdone - sample_queue_count)*100/sampnum); pthread_mutex_lock(&sampler_mutex); if (!more_samples) break; } dbgmsg(2, "\b\b\b\b\b\b\b\b100%% done\n"); pthread_mutex_unlock(&sampler_mutex); // wait for workers dbgmsg(2, "- waiting for threads to finish\n"); pthread_mutex_lock(&sample_queue_mutex); end_of_samples = true; while (sample_queue_count) { pthread_cond_broadcast(&sample_queue_cond); pthread_mutex_unlock(&sample_queue_mutex); pthread_mutex_lock(&sample_queue_mutex); } pthread_cond_broadcast(&sample_queue_cond); pthread_mutex_unlock(&sample_queue_mutex); for (int t = 0; t < num_threads; t++) pthread_join(threads[t], NULL); phase ++; } delete[] sample_queue;}void Raytracer::ambientOcclusion(int samples, Float distance, Float angle){ ao_samples = samples; ao_distance = distance; ao_angle = angle; if (ao_distance == 0) /* 0 ==> Inf */ ao_distance = Inf;}