add MSVC compiler support, make it default for Windows pyrit
authorRadek Brich <radek.brich@devl.cz>
Mon, 05 May 2008 15:31:14 +0200 (2008-05-05)
branchpyrit
changeset 92 9af5c039b678
parent 91 9d66d323c354
child 93 96d65f841791
add MSVC compiler support, make it default for Windows new header file simd.h for SSE abstraction and helpers add mselect pseudo instruction for common or(and(...), andnot(...)) replace many SSE intrinsics with new names new MemoryPool class (mempool.h) for faster KdNode allocation remove setMaxDepth() from Octree and KdTree, make max_depth const, it should be defined in constructor and never changed, change after building tree would cause error in traversal modify DefaultSampler to generate nice 2x2 packets of samples for packet tracing optimize Box and BBox::intersect_packet add precomputed invdir attribute to RayPacket scons build system: check for pthread library on Windows check for SDL generate include/config.h with variables detected by scons configuration move auxiliary files to build/ add sanity checks add writable operator[] to Vector
.bzrignore
SConstruct
ccdemos/SConscript
ccdemos/common_ply.h
ccdemos/common_sdl.h
ccdemos/realtime.cc
ccdemos/realtime_bunny.cc
ccdemos/realtime_dragon.cc
ccdemos/spheres_shadow.cc
ccdemos/textures.cc
include/common.h
include/container.h
include/kdtree.h
include/material.h
include/mempool.h
include/octree.h
include/raytracer.h
include/scene.h
include/shapes.h
include/simd.h
include/vector.h
models/SConscript
src/SConscript
src/container.cc
src/kdtree.cc
src/octree.cc
src/pixmap.cc
src/raytracer.cc
src/sampler.cc
src/scene.cc
src/shapes.cc
--- a/.bzrignore	Fri May 02 13:27:47 2008 +0200
+++ b/.bzrignore	Mon May 05 15:31:14 2008 +0200
@@ -1,7 +1,4 @@
 build/*
 docs/*
-.sconsign.dblite
-.sconf_temp
-.optioncache
-config.log
 tools/cpuflags
+include/config.h
--- a/SConstruct	Fri May 02 13:27:47 2008 +0200
+++ b/SConstruct	Mon May 05 15:31:14 2008 +0200
@@ -30,12 +30,22 @@
 """)
 
 import os, sys
-env = Environment() #(ENV = {'PATH' : os.environ['PATH']})
+
+EnsurePythonVersion(2, 3)
+EnsureSConsVersion(0, 97)
+
 Decider('MD5-timestamp')
+SConsignFile('build/.sconsign.dblite')
 
-opt = Options(['.optioncache'])
+if sys.platform == 'win32':
+	tools = ['mingw']
+else:
+	tools = ['default']
+
+env = Environment(tools = tools)
+
+opt = Options(['build/.optioncache'])
 opt.AddOptions(
-	BoolOption('intelc', 'use Intel C++ Compiler, if available', True),
 	BoolOption('simd', 'allow SSE intrinsics', True),
 	('precision', 'floating point number precision (single/double)', "single"),
 	('flags', 'add additional compiler flags', ""),
@@ -44,11 +54,18 @@
 )
 if env['PLATFORM'] == 'win32':
 	opt.AddOptions(
+		BoolOption('msvc', 'use Microsoft Visual C++ Compiler, if available', True),
 		('pythonpath', 'path to Python installation',
 			'C:\\Python%c%c' % (sys.version[0], sys.version[2])),
-		)
+	)
+else:
+	opt.AddOptions(
+		BoolOption('intelc', 'use Intel C++ Compiler, if available', True),
+	)
+
+
 opt.Update(env)
-opt.Save('.optioncache', env)
+opt.Save('build/.optioncache', env)
 Help(opt.GenerateHelpText(env))
 
 
@@ -69,7 +86,7 @@
 
 def CheckIntelC(context):
 	global intelc, intelcversion
-	context.Message('Checking for Intel C++ Compiler... ')
+	context.Message('Checking for IntelC compiler... ')
 	intelc = Tool("intelc").exists(env) == True
 	if intelc:
 		testenv = Environment()
@@ -83,7 +100,7 @@
 
 def CheckGCC(context):
 	global gcc, gccversion
-	context.Message('Checking for GCC... ')
+	context.Message('Checking for GCC compiler... ')
 	gcc = "g++" in env['TOOLS']
 	if gcc:
 		gccversion = env['CCVERSION']
@@ -93,6 +110,19 @@
 		context.Result(False)
 	return gcc
 
+def CheckMSVC(context):
+	global msvc, msvcversion
+	context.Message('Checking for MSVC compiler... ')
+	testenv = Environment()
+	msvc = "msvc" in testenv['TOOLS']
+	if msvc:
+		msvcversion = testenv['MSVS_VERSION']
+		context.Result(msvcversion)
+	else:
+		msvcversion = ''
+		context.Result(False)
+	return msvc
+
 def CheckCPUFlags(context):
 	global cpu, cpuflags_gcc, cpuflags_intelc
 	context.Message('Checking CPU arch and flags... ')
@@ -102,26 +132,51 @@
 	context.Result(cpu)
 	return True
 
-conf = Configure(env,
+conf_dir = "#/build/.sconf_temp"
+log_file="#/build/config.log"
+config_h="#/include/config.h"
+conf = Configure(env, conf_dir=conf_dir, log_file=log_file, config_h=config_h,
 	custom_tests = {
 		'CheckPlatform' : CheckPlatform, 'CheckCPUFlags' : CheckCPUFlags,
-		'CheckIntelC' : CheckIntelC, 'CheckGCC' : CheckGCC})
+		'CheckIntelC' : CheckIntelC, 'CheckGCC' : CheckGCC, 'CheckMSVC' : CheckMSVC})
 conf.CheckPlatform()
+
 conf.CheckGCC()
-conf.CheckIntelC()
-conf.CheckCPUFlags()
+if platform == 'win32':
+	conf.CheckMSVC()
+	intelc = False
+else:
+	conf.CheckIntelC()
+	msvc=False
 
-if intelc and conf.env['intelc']:
-	Tool("intelc").generate(conf.env)
+if intelc or gcc:
+	conf.CheckCPUFlags()
+
+if intelc and (not gcc or conf.env['intelc']):
+	Tool('intelc').generate(conf.env)
 	cc = 'intelc'
+elif msvc and (not gcc or conf.env['msvc']):
+	Tool('default').generate(conf.env)
+	conf.Define("MSVC")
+	cc = 'msvc'
 elif gcc:
 	cc = 'gcc'
+else:
+	cc = 'none'
+
+if platform == 'win32' and cc == 'gcc':
+	conf.env.Append(LIBPATH=["C:/mingw/lib", "C:/msys/mingw/lib"])
+	conf.env.Append(CPPPATH=["C:/mingw/include", "C:/msys/mingw/include"])
 
 add_flags = ''
 if cc == 'gcc':
 	add_flags += cpuflags_gcc + ' -ffast-math '
 if cc == 'intelc':
 	add_flags += cpuflags_intelc + ' '
+if cc == 'msvc':
+	add_flags += '/fp:fast '
+	if conf.env['simd']:
+		add_flags += '/arch:SSE '
 
 if conf.env['force_flags']:
 	add_flags = conf.env['flags'] + ' '
@@ -129,18 +184,19 @@
 	add_flags += conf.env['flags'] + ' '
 
 if conf.env['precision'] == 'double':
-	add_flags += '-DPYRIT_DOUBLE '
+	conf.Define("PYRIT_DOUBLE")
 elif cc == 'gcc':
 	add_flags += '-fsingle-precision-constant '
 
-if not conf.env['simd']:
-	add_flags += '-DNO_SSE '
+if not conf.env['simd'] or conf.env['precision'] == 'double':
+	conf.Define("NO_SIMD")
 
 if cc == 'intelc':
 	conf.env.Append(CCFLAGS="-O3 -w1 " + add_flags)
 elif cc == 'gcc':
 	conf.env.Append(CCFLAGS="-O3 -Wall -pipe " + add_flags)
-	# CCFLAGS= -fno-strict-aliasing
+elif cc == 'msvc':
+	conf.env.Append(CCFLAGS="/Ox /Ob2 /GS- /Gy /GF /GR- /Zp16 /MD /EHsc /vmb " + add_flags)
 else:
 	print "No supported compiler found."
 	Exit(1)
@@ -148,47 +204,79 @@
 print "Using compiler: " + cc
 print "Additional flags: " + add_flags
 
-if conf.CheckLibWithHeader('png', 'png.h', 'C'):
-	conf.env.Append(CCFLAGS='-DHAVE_PNG')
-	conf.env.Append(LIBS=['png'])
+pthread = True
+if conf.env['PLATFORM'] == 'win32':
+	if cc == 'msvc':
+		if not conf.CheckLib('pthreadVC2'):
+			pthread = False
+		conf.env.Append(LIBS=["pthreadVC2"])
+	elif cc == 'gcc':
+		if not conf.CheckLib('pthreadGC2'):
+			pthread = False
+		conf.env.Append(LIBS=["pthreadGC2"])
+else:
+	conf.env.Append(CCFLAGS="-pthread ")
 
-if not conf.CheckCHeader('pthread.h'):
+if not pthread:
 	print 'Error: Cannot build without pthread.'
 	Exit(1)
 
-if conf.env['PLATFORM'] == 'win32':
-	conf.env.Append(LIBS=["pthreadGC2"])
-else:
-	conf.env.Append(CCFLAGS="-pthread ")
+if conf.CheckLibWithHeader('png', 'png.h', 'C'):
+	conf.Define('HAVE_PNG')
+	conf.env.Append(LIBS=['png'])
+elif conf.CheckLib('libpng13'):
+	conf.Define('HAVE_PNG')
+	conf.env.Append(LIBS=['libpng13'])
 
 if conf.env['profile'] and cc == 'gcc':
 	conf.env.Append(CCFLAGS="-pg", LINKFLAGS="-pg")
 
 env = conf.Finish()
 
+# configure SDL
+sdlenv = env.Clone()
+if cc != 'msvc':
+	try:
+		sdlenv.ParseConfig('sh sdl-config --cflags')
+		sdlenv.ParseConfig('sh sdl-config --libs')
+	except:
+		pass
+else:
+	sdlenv.Append(LIBS=['SDL', 'SDLmain'])
+
+conf = Configure(sdlenv, conf_dir=conf_dir, log_file=log_file, config_h=config_h)
+have_sdl = False
+if conf.CheckLib('SDL'):
+	have_sdl = True
+else:
+	print "SDL not found, some demos will not built."
+sdlenv = conf.Finish()
+
+if cc == 'msvc':
+	sdlenv.Append(LINKFLAGS="/SUBSYSTEM:WINDOWS")
 
 ### build targets
 
-lib = SConscript('src/SConscript', build_dir='build/lib', duplicate=0, exports={'env':env,'buildmodule':False})
-pymodule = SConscript('src/SConscript', build_dir='build/pymodule', duplicate=0, exports={'env':env,'buildmodule':True})
+Export('env sdlenv cc')
+lib = SConscript('src/SConscript', build_dir='build/lib', duplicate=0,
+	exports={'buildmodule':False})
+pymodule = SConscript('src/SConscript', build_dir='build/pymodule', duplicate=0,
+	exports={'buildmodule':True})
 
-SConscript('ccdemos/SConscript', build_dir='build/ccdemos', duplicate=0, exports='env lib')
+if have_sdl:
+	SConscript('ccdemos/SConscript', build_dir='build/ccdemos', duplicate=0,
+		exports='lib')
+
 SConscript('demos/SConscript', exports='pymodule')
-env.Alias('demos', ['cc-demos', 'python-demos'])
-
-SConscript('tests/SConscript', build_dir='build/tests', duplicate=0, exports='env lib')
-
+SConscript('tests/SConscript', build_dir='build/tests', duplicate=0, exports='lib')
 SConscript('models/SConscript')
 
+env.Alias('demos', ['cc-demos', 'python-demos'])
 env.Alias('libs', ['static-lib', 'python-module'])
-
 env.Alias('docs', Command('docs/html', [], 'doxygen'))
 env.Clean('docs', ['docs/html'])
-
 env.Alias('no-docs', ['libs', 'demos', 'models'])
 env.Alias('no-download', ['libs', 'demos', 'local-models'])
-
 env.Alias('all', ['no-docs', 'docs'])
-
 env.Alias('pyrit', 'no-download')
 Default('pyrit')
--- a/ccdemos/SConscript	Fri May 02 13:27:47 2008 +0200
+++ b/ccdemos/SConscript	Mon May 05 15:31:14 2008 +0200
@@ -1,21 +1,13 @@
-Import('env lib')
-myenv = env.Clone()
+Import('env sdlenv lib')
+myenv = sdlenv.Clone()
 myenv.Append(CPPPATH = ['.','#include'], LIBPATH='#build/lib')
 myenv.Prepend(LIBS=['pyrit'])
 
-sdlenv = myenv.Clone()
-try:
-	sdlenv.ParseConfig('sh sdl-config --cflags --libs')
-except:
-	print "SDL not found, some demos will not built."
-	myenv.Alias('cc-demos', None)
-	Return()
+l = []
+l.append( myenv.Program(['realtime.cc']) )
+l.append( myenv.Program(['realtime_bunny.cc']) )
+l.append( myenv.Program(['realtime_dragon.cc']) )
+l.append( myenv.Program(['spheres_shadow.cc']) )
+l.append( myenv.Program(['textures.cc']) )
 
-l = []
-l.append( sdlenv.Program(['realtime.cc']) )
-l.append( sdlenv.Program(['realtime_bunny.cc']) )
-l.append( sdlenv.Program(['realtime_dragon.cc']) )
-l.append( sdlenv.Program(['spheres_shadow.cc']) )
-l.append( sdlenv.Program(['textures.cc']) )
-
-myenv.Alias('cc-demos', l)
+env.Alias('cc-demos', l)
--- a/ccdemos/common_ply.h	Fri May 02 13:27:47 2008 +0200
+++ b/ccdemos/common_ply.h	Mon May 05 15:31:14 2008 +0200
@@ -2,11 +2,13 @@
 #include <fstream>
 #include <iomanip>
 
-void load_ply(Raytracer &rt, const char *filename, Material *mat, Vector scale, Vector transp)
+void load_ply(Raytracer &rt, const char *filename, Material *mat, const Vector &scale, const Vector &transp)
 {
+	MemoryPool<Triangle> *mp_tri;
+	MemoryPool<NormalVertex> *mp_vert;
 	vector<NormalVertex*> vertices;
-	vector<Vector> normals;
 	vector<int> vertex_face_num;
+	Vector *normals;
 	ifstream f(filename);
 	string token = "a";
 	if (!f.is_open())
@@ -33,14 +35,15 @@
 	// read vertices
 	Vector P;
 	int num = vertex_num;
+	mp_vert = new MemoryPool<NormalVertex>(vertex_num);
+	normals = new Vector[vertex_num];
 	while (num--)
 	{
 		f >> P.x >> P.y >> P.z;
 		P.x = scale.x*P.x + transp.x;
 		P.y = scale.y*P.y + transp.y;
 		P.z = scale.z*P.z + transp.z;
-		vertices.push_back(new NormalVertex(P));
-		normals.push_back(Vector());
+		vertices.push_back(new (mp_vert->alloc()) NormalVertex(P));
 		vertex_face_num.push_back(0);
 		f.ignore(1000,'\n');
 	}
@@ -48,6 +51,7 @@
 	// read faces
 	Triangle *face;
 	int v1, v2, v3;
+	mp_tri = new MemoryPool<Triangle>(face_num);
 	while (face_num--)
 	{
 		f >> num;
@@ -67,14 +71,14 @@
 			continue;
 		}
 
-		face = new Triangle(vertices[v1], vertices[v3], vertices[v2], mat);
+		face = new (mp_tri->alloc()) Triangle(vertices[v1], vertices[v3], vertices[v2], mat);
 		rt.addShape(face);
 
-		normals.at(v1) += face->getNormal();
+		normals[v1] += face->getNormal();
 		vertex_face_num.at(v1)++;
-		normals.at(v2) += face->getNormal();
+		normals[v2] += face->getNormal();
 		vertex_face_num.at(v2)++;
-		normals.at(v3) += face->getNormal();
+		normals[v3] += face->getNormal();
 		vertex_face_num.at(v3)++;
 		f.ignore(1000,'\n');
 	}
@@ -82,10 +86,11 @@
 	for (int i = 0; i < vertex_num; i++)
 		if (vertex_face_num.at(i))
 		{
-			normals.at(i) /= vertex_face_num.at(i);
-			normals.at(i).normalize();
-			vertices.at(i)->N = normals.at(i);
+			normals[i] /= vertex_face_num.at(i);
+			normals[i].normalize();
+			vertices.at(i)->N = normals[i];
 		}
 
+	delete[] normals;
 	f.close();
 }
--- a/ccdemos/common_sdl.h	Fri May 02 13:27:47 2008 +0200
+++ b/ccdemos/common_sdl.h	Mon May 05 15:31:14 2008 +0200
@@ -30,7 +30,6 @@
 	Uint32 *bufp = (Uint32 *)screen->pixels;
 	for (Float *fd = render_buffer; fd != render_buffer + w*h*3; fd += 3)
 	{
-#ifdef NO_SSE
 		unsigned char c[3];
 		for (int i = 0; i < 3; i++)
 		{
@@ -40,11 +39,6 @@
 				c[i] = (unsigned char)(fd[i] * 255.0);
 		}
 		*bufp = SDL_MapRGB(screen->format, c[0], c[1], c[2]);
-#else
-		__m64 m = _mm_cvtps_pi16(_mm_mul_ps(_mm_set_ps1(255.0),
-			_mm_min_ps(mOne, _mm_set_ps(0, fd[2],fd[1],fd[0]))));
-		*bufp = SDL_MapRGB(screen->format, ((char*)&m)[0], ((char*)&m)[2], ((char*)&m)[4]);
-#endif
 		bufp++;
 	}
 
@@ -108,27 +102,27 @@
 						break;
 					}
 					if (event.key.keysym.sym == SDLK_DOWN) {
-						rotx = +0.01;
+						rotx = +0.01f;
 						break;
 					}
 					if (event.key.keysym.sym == SDLK_UP) {
-						rotx = -0.01;
+						rotx = -0.01f;
 						break;
 					}
 					if (event.key.keysym.sym == SDLK_w) {
-						move = +0.5;
+						move = +0.5f;
 						break;
 					}
 					if (event.key.keysym.sym == SDLK_s) {
-						move = -0.5;
+						move = -0.5f;
 						break;
 					}
 					if (event.key.keysym.sym == SDLK_c) {
 						// print camera coordinates
-						cout << "Camera: eye=" << cam.eye
-						<< ", p=" << cam.p
-						<< ", u=" << cam.u
-						<< ", v=" << cam.v
+						cout << "Camera: eye=" << cam.getEye()
+						<< ", p=" << cam.getp()
+						<< ", u=" << cam.getu()
+						<< ", v=" << cam.getv()
 						<< endl;
 						break;
 					}
@@ -156,15 +150,17 @@
 			}
 		}
 		cam.rotate(Quaternion(cos(roty),0,sin(roty),0).normalize());
-		cam.rotate(Quaternion(cos(rotx),cam.u[0]*sin(rotx),0,cam.u[2]*sin(rotx)).normalize());
-		cam.u.y = 0;
-		cam.u.normalize();
+		cam.rotate(Quaternion(cos(rotx),
+			cam.getu()[0]*sin(rotx),0,cam.getu()[2]*sin(rotx)).normalize());
+		//cam.u.y = 0;
+		//cam.u.normalize();
 		if (move != 0.0)
 			cam.move(move,0,0);
 		if (update_callback != NULL)
 			update_callback(render_buffer);
 		update(rt, screen, render_buffer);
 	}
+
 	free(render_buffer);
 
 	Uint32 fp100s_aver = fp10s_acc*10/fp10s_acc_samples;
--- a/ccdemos/realtime.cc	Fri May 02 13:27:47 2008 +0200
+++ b/ccdemos/realtime.cc	Mon May 05 15:31:14 2008 +0200
@@ -14,18 +14,18 @@
 	rt.setMaxDepth(3);
 	rt.setTop(&top);
 
-	Light light1(Vector(2.0, -5.0, -5.0), Colour(0.7, 0.3, 0.6));
+	Light light1(Vector(2.0f, -5.0f, -5.0f), Colour(0.7f, 0.3f, 0.6f));
 	light1.castShadows(false);
 	rt.addLight(&light1);
 
-	Light light2(Vector(-2.0, 10.0, 2.0), Colour(0.4, 0.6, 0.3));
+	Light light2(Vector(-2.0f, 10.0f, 2.0f), Colour(0.4f, 0.6f, 0.3f));
 	light2.castShadows(false);
 	rt.addLight(&light2);
 
-	Material mat_sph(Colour(1.0, 1.0, 1.0));
+	Material mat_sph(Colour(1.0f, 1.0f, 1.0f));
 	for (int y=0; y<10; y++)
 		for (int x=0; x<10; x++)
-			rt.addShape(new Sphere(Vector(x*2-10, (Float)rand()/RAND_MAX*5.0, y*2-10), 0.45, &mat_sph));
+			rt.addShape(new Sphere(Vector(x*2-10, (Float)rand()/RAND_MAX*5.0f, y*2-10), 0.45f, &mat_sph));
 
 	rt.setCamera(&cam);
 	cam.setEye(Vector(0,0,10));
@@ -33,4 +33,6 @@
 	top.optimize();
 
 	loop_sdl(rt, cam);
+
+	return 0;
 }
--- a/ccdemos/realtime_bunny.cc	Fri May 02 13:27:47 2008 +0200
+++ b/ccdemos/realtime_bunny.cc	Mon May 05 15:31:14 2008 +0200
@@ -31,4 +31,6 @@
 	top.optimize();
 
 	loop_sdl(rt, cam);
+	
+	return 0;
 }
--- a/ccdemos/realtime_dragon.cc	Fri May 02 13:27:47 2008 +0200
+++ b/ccdemos/realtime_dragon.cc	Mon May 05 15:31:14 2008 +0200
@@ -55,4 +55,6 @@
 	}
 
 	loop_sdl(rt, cam);
+	
+	return 0;
 }
--- a/ccdemos/spheres_shadow.cc	Fri May 02 13:27:47 2008 +0200
+++ b/ccdemos/spheres_shadow.cc	Mon May 05 15:31:14 2008 +0200
@@ -4,20 +4,20 @@
 #include "common_sdl.h"
 
 Camera cam;
-Light light(Vector(-2.0, 10.0, -2.0), Colour(0.9, 0.9, 0.9));
+Light light(Vector(-2.0f, 10.0f, -2.0f), Colour(0.9f, 0.9f, 0.9f));
 
 Float lx, ly, lz, cf;
 
 void update_callback(Float*)
 {
-	if (lx != 0.0)
+	if (lx != 0.0f)
 		light.pos.x += lx;
-	if (ly != 0.0)
+	if (ly != 0.0f)
 		light.pos.y += ly;
-	if (lz != 0.0)
+	if (lz != 0.0f)
 		light.pos.z += lz;
-	if (cf != 0.0)
-		cam.F += cf;
+	if (cf != 0.0f)
+		cam.setF(cam.getF() + cf);
 }
 
 void key_callback(int key, int down)
@@ -25,29 +25,29 @@
 	switch (key)
 	{
 		case SDLK_r:
-			lx = -0.1 * down;
+			lx = -0.1f * down;
 			break;
 		case SDLK_t:
-			lx = +0.1 * down;
+			lx = +0.1f * down;
 			break;
 		case SDLK_f:
-			ly = -0.1 * down;
+			ly = -0.1f * down;
 			break;
 		case SDLK_g:
-			ly = +0.1 * down;
+			ly = +0.1f * down;
 			break;
 		case SDLK_v:
-			lz = -0.1 * down;
+			lz = -0.1f * down;
 			break;
 		case SDLK_b:
-			lz = +0.1 * down;
+			lz = +0.1f * down;
 			break;
 
 		case SDLK_z:
-			cf = -0.02 * down;
+			cf = -0.02f * down;
 			break;
 		case SDLK_x:
-			cf = +0.02 * down;
+			cf = +0.02f * down;
 			break;
 	}
 }
@@ -91,10 +91,10 @@
 
 	top.optimize();
 
-	cam.setEye(Vector(-2.28908, 4.30992, 12.3051));
-	cam.p = Vector(0.0988566, -0.139543, -0.985269);
-	cam.u = Vector(-0.995004, 0, -0.0998334);
-	cam.v = Vector(0.0139311, 0.990216, -0.138846);
+	cam.setEye(Vector(-2.28908f, 4.30992f, 12.3051f));
+	cam.setp(Vector(0.0988566f, -0.139543f, -0.985269f));
+	cam.setu(Vector(-0.995004f, 0.0f, -0.0998334f));
+	cam.setv(Vector(0.0139311f, 0.990216f, -0.138846f));
 	rt.setCamera(&cam);
 
 	w = 800;
@@ -113,4 +113,6 @@
 		rt.render();
 		sampler.getPixmap().writePNG("spheres_shadow.png");
 	}
+
+	return 0;
 }
--- a/ccdemos/textures.cc	Fri May 02 13:27:47 2008 +0200
+++ b/ccdemos/textures.cc	Mon May 05 15:31:14 2008 +0200
@@ -17,7 +17,7 @@
 	if (lz != 0.0)
 		light.pos.z += lz;
 	if (cf != 0.0)
-		cam.F += cf;
+		cam.setF(cam.getF() + cf);
 }
 
 void key_callback(int key, int down)
@@ -148,4 +148,6 @@
 		rt.render();
 		sampler.getPixmap().writePNG("textures.png");
 	}
+	
+	return 0;
 }
--- a/include/common.h	Fri May 02 13:27:47 2008 +0200
+++ b/include/common.h	Mon May 05 15:31:14 2008 +0200
@@ -27,8 +27,11 @@
 #ifndef COMMON_H
 #define COMMON_H
 
+#include "config.h"
+
 #include <stdio.h>
 #include <stdarg.h>
+#include <math.h>
 #include <float.h>
 #include <pthread.h>
 #include <string>
@@ -41,30 +44,12 @@
 # define Inf DBL_MAX
 #else
 # define Float float
-# define Eps 1e-6
+# define Eps 1e-6f
 # define Inf FLT_MAX
 #endif
 
-#ifndef NO_SSE
-// SSE intrinsics
-#include <xmmintrin.h>
-
-const __m128 mZero = _mm_set_ps1(0.);
-const __m128 mOne = _mm_set_ps1(1.);
-const __m128 mTwo = _mm_set_ps1(2.);
-const __m128 mEps = _mm_set_ps1(Eps);
-const __m128 mMEps = _mm_set_ps1(-Eps);
-const __m128 mInf = _mm_set_ps1(Inf);
-const __m128 mAllSet = _mm_cmplt_ps(mZero, mOne);
-
-inline const __m128 _mm_fastpow(const __m128& base, const __m128& exponent)
-{
-    __m128 denom = _mm_mul_ps( exponent, base);
-    denom = _mm_sub_ps( exponent, denom);
-    denom = _mm_add_ps( base, denom);
-    return _mm_mul_ps( base, _mm_rcp_ps(denom));
-}
-#endif
+// enable M_* constants in MSVC
+#define _USE_MATH_DEFINES
 
 /* verbosity level:
 0: only errors (E)
--- a/include/container.h	Fri May 02 13:27:47 2008 +0200
+++ b/include/container.h	Mon May 05 15:31:14 2008 +0200
@@ -42,8 +42,10 @@
 	BBox bbox;
 public:
 	ShapeList shapes;
+
 	Container(): bbox(), shapes() {};
 	virtual ~Container() {};
+
 	virtual void addShape(Shape* aShape);
 	//void addShapeNoExtend(Shape* aShape) { shapes.push_back(aShape); };
 	virtual Shape *nearest_intersection(const Shape *origin_shape, const Ray &ray,
@@ -55,9 +57,9 @@
 
 	virtual ostream & dump(ostream &st);
 
-#ifndef NO_SSE
-	virtual void packet_intersection(const Shape **origin_shapes, const RayPacket &rays,
-		Float *nearest_distances, Shape **nearest_shapes);
+#ifndef NO_SIMD
+	virtual void packet_intersection(const Shape* const* origin_shapes, const RayPacket &rays,
+		Float *nearest_distances, Shape** nearest_shapes);
 #endif
 };
 
--- a/include/kdtree.h	Fri May 02 13:27:47 2008 +0200
+++ b/include/kdtree.h	Mon May 05 15:31:14 2008 +0200
@@ -31,9 +31,11 @@
 #include <fstream>
 #include <assert.h>
 
-#include "container.h"
+#include "common.h"
 #include "vector.h"
 #include "scene.h"
+#include "container.h"
+#include "mempool.h"
 
 using namespace std;
 
@@ -42,7 +44,10 @@
  */
 class KdNode
 {
-	Float split;
+	union {
+		Float split;
+		void *pad64;  /* pad to 64 bits on 64bit platforms */
+	};
 	union {
 		KdNode *children;
 		ShapeList *shapes;
@@ -62,10 +67,10 @@
 	const Float& getSplit() const { return split; };
 
 	void setChildren(KdNode *node) { children = node; assert((flags & 3) == 0); };
-	KdNode* getLeftChild() const { return (KdNode*)((off_t)children & ~3); };
-	KdNode* getRightChild() const { return (KdNode*)((off_t)children & ~3) + 1; };
+	KdNode* getLeftChild() const { return (KdNode*)((size_t)children & ~3); };
+	KdNode* getRightChild() const { return (KdNode*)(((size_t)children & ~3) + 16); };
 
-	ShapeList* getShapes() const { return (ShapeList*)((off_t)shapes & ~3); };
+	ShapeList* getShapes() const { return (ShapeList*)((size_t)shapes & ~3); };
 	void addShape(Shape* aShape) { getShapes()->push_back(aShape); };
 };
 
@@ -76,25 +81,26 @@
 {
 	KdNode *root;
 	bool built;
-	int max_depth;
+	const int max_depth;
+	MemoryPool<KdNode> mempool;
 
-	void recursive_build(KdNode *node, BBox bbox, int maxdepth);
+	void recursive_build(KdNode *node, const BBox &bbox, int maxdepth);
 	void recursive_load(istream &st, KdNode *node);
 public:
-	KdTree() : Container(), root(NULL), built(false), max_depth(32) {};
+	KdTree(): Container(), root(NULL), built(false), max_depth(32), mempool(64) {};
+	KdTree(int maxdepth): Container(), root(NULL), built(false), max_depth(maxdepth), mempool(64) {};
 	~KdTree() { if (root) delete root; };
 	void addShape(Shape* aShape) { Container::addShape(aShape); built = false; };
 	Shape *nearest_intersection(const Shape *origin_shape, const Ray &ray,
 		Float &nearest_distance);
-#ifndef NO_SSE
-	void packet_intersection(const Shape **origin_shapes, const RayPacket &rays,
+#ifndef NO_SIMD
+	void packet_intersection(const Shape* const* origin_shapes, const RayPacket &rays,
 		Float *nearest_distances, Shape **nearest_shapes);
 #endif
 	void optimize() { build(); };
 	void build();
 	bool isBuilt() const { return built; };
 	KdNode *getRootNode() const { return root; };
-	void setMaxDepth(int md) { max_depth = md; };
 
 	ostream & dump(ostream &st);
 	istream & load(istream &st, Material *mat);
--- a/include/material.h	Fri May 02 13:27:47 2008 +0200
+++ b/include/material.h	Mon May 05 15:31:14 2008 +0200
@@ -107,7 +107,7 @@
 	Float invsize;
 public:
 	TextureMap(const Vector &acenter, const Float &size):
-		center(acenter), invsize(1./size) {};
+		center(acenter), invsize(1.0f/size) {};
 	virtual ~TextureMap() {};
 	virtual void map(const Vector &point, Float &u, Float &v) = 0;
 };
@@ -238,9 +238,9 @@
 	{
 		Float u,v;
 		map->map(point, u,v);
-		u = u - 0.5;
+		u = u - 0.5f;
 		u -= floor(u);
-		v = -(v - 0.5);
+		v = -(v - 0.5f);
 		v -= floor(v);
 		return pixmap->get((int)(u*pixmap->getWidth()), (int)(v*pixmap->getHeight()));
 	};
@@ -259,7 +259,7 @@
 	{
 		Float u,v, val;
 		map->map(point, u,v);
-		val = 0.5*(round(u - floor(u)) + round(v - floor(v)));
+		val = 0.5f*(floor(0.5f + u - floor(u)) + floor(0.5f + v - floor(v)));
 		return colourmap->map(val);
 	};
 };
@@ -299,13 +299,13 @@
 
 	Material(const Colour &acolour): colour(acolour), texture(NULL), smooth(false)
 	{
-		ambient = 0.2;
-		diffuse = 0.8;
-		specular = 0.2;
-		shininess = 0.5;
-		reflectivity = 0.2;
-		transmissivity = 0.0;
-		refract_index = 1.3;
+		ambient = 0.2f;
+		diffuse = 0.8f;
+		specular = 0.2f;
+		shininess = 0.5f;
+		reflectivity = 0.2f;
+		transmissivity = 0.0f;
+		refract_index = 1.3f;
 	}
 
 	void setPhong(const Float amb, const Float dif, const Float spec, const Float shin)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/mempool.h	Mon May 05 15:31:14 2008 +0200
@@ -0,0 +1,78 @@
+/*
+ * mempool.h: memory pool
+ *
+ * This file is part of Pyrit Ray Tracer.
+ *
+ * Copyright 2008  Radek Brich
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef MEMPOOL_H
+#define MEMPOOL_H
+
+#include "common.h"
+
+template <typename Type>
+class MemoryPool
+{
+	void *mem;
+	size_t cur, size, typesize, align;
+
+	void init()
+	{
+		typesize = ((sizeof(Type)-1)/align+1)*align;
+#ifndef NO_SIMD
+		mem = (Type *)_mm_malloc(size * typesize, align);
+#else
+		mem = (Type *)malloc(inisize * typesize);
+#endif
+	};
+public:
+	MemoryPool(const size_t inisize):
+		cur(0), size(inisize), align(16) { init(); };
+	MemoryPool(const size_t inisize, const size_t inialign):
+		cur(0), size(inisize), align(inialign) { init(); };
+	~MemoryPool()
+	{
+#ifndef NO_SIMD
+		_mm_free(mem);
+#else
+		free(mem);
+#endif
+	};
+
+	void *alloc()
+	{
+		if (cur == size)
+		{
+#ifndef NO_SIMD
+			void *newmem = _mm_malloc(2 * size * typesize, align);
+#else
+			void *newmem = malloc(2 * size * typesize);
+#endif
+			memcpy(newmem, mem, size * typesize);
+			mem = newmem;
+			size *= 2;
+		}
+		return (void*)((size_t)mem + typesize*(cur++));
+	};
+};
+
+#endif
--- a/include/octree.h	Fri May 02 13:27:47 2008 +0200
+++ b/include/octree.h	Mon May 05 15:31:14 2008 +0200
@@ -61,10 +61,10 @@
 	OctreeNode *getChild(const int num) { assert(!isLeaf()); return children + num; };
 
 	void addShape(Shape* aShape) { getShapes()->push_back(aShape); };
-	ShapeList *getShapes() { return (ShapeList*)((off_t)shapes & ~(off_t)1); };
+	ShapeList *getShapes() { return (ShapeList*)((size_t)shapes & ~(size_t)1); };
 	void setShapes(ShapeList *const ashapes) { shapes = ashapes; assert(!isLeaf()); setLeaf(); };
 
-	void subdivide(BBox bbox, int maxdepth);
+	void subdivide(const BBox &bbox, int maxdepth);
 };
 
 /**
@@ -74,9 +74,10 @@
 {
 	OctreeNode *root;
 	bool built;
-	int max_depth;
+	const int max_depth;
 public:
 	Octree() : Container(), root(NULL), built(false), max_depth(10) {};
+	Octree(int maxdepth) : Container(), root(NULL), built(false), max_depth(maxdepth) {};
 	~Octree() { if (root) delete root; };
 	void addShape(Shape* aShape) { Container::addShape(aShape); built = false; };
 	Shape *nearest_intersection(const Shape *origin_shape, const Ray &ray,
@@ -85,7 +86,6 @@
 	void build();
 	void save(ostream &str, OctreeNode *node = NULL) {};
 	void load(istream &str, OctreeNode *node = NULL) {};
-	void setMaxDepth(int md) { max_depth = md; };
 };
 
 #endif
--- a/include/raytracer.h	Fri May 02 13:27:47 2008 +0200
+++ b/include/raytracer.h	Mon May 05 15:31:14 2008 +0200
@@ -70,8 +70,8 @@
 		const Vector &P, const Vector &N, const Vector &V);
 	void lightScatter(const Ray &ray, const Shape *shape, int depth,
 		const Vector &P, const Vector &normal, bool from_inside, Colour &col);
-#ifndef NO_SSE
-	VectorPacket PhongShader_packet(const Shape **shapes,
+#ifndef NO_SIMD
+	VectorPacket PhongShader_packet(const Shape* const* shapes,
 		const VectorPacket &P, const VectorPacket &N, const VectorPacket &V);
 	void raytracePacket(RayPacket &rays, Colour *results);
 #endif
--- a/include/scene.h	Fri May 02 13:27:47 2008 +0200
+++ b/include/scene.h	Mon May 05 15:31:14 2008 +0200
@@ -42,23 +42,24 @@
 {
 public:
 	Vector o, dir;
+
 	Ray(): o(), dir() {};
 	Ray(const Vector &ao, const Vector &adir):
 		o(ao), dir(adir) {};
 };
 
-#ifndef NO_SSE
+#ifndef NO_SIMD
 /**
  * packet of 4 rays
  */
 class RayPacket
 {
 public:
-	VectorPacket o, dir;
+	VectorPacket o, dir, invdir;
 
 	RayPacket(): o(), dir() {};
 	RayPacket(const VectorPacket &ao, const VectorPacket &adir):
-		o(ao), dir(adir) {};
+		o(ao), dir(adir), invdir(mOne/adir) {};
 
 	// index operator - get a ray
 	Ray operator[](int i) const
@@ -75,10 +76,9 @@
  */
 class Camera
 {
-public:
 	Vector eye, p, u, v;
 	Float F;
-
+public:
 	Camera(): eye(0,0,10), p(0,0,-1), u(-1,0,0), v(0,1,0), F(2.*tan(M_PI/8.)) {};
 	Camera(const Vector &C, const Vector &ap, const Vector &au, const Vector &av):
 		eye(C), p(ap), u(au), v(av), F(2.*tan(M_PI/8.)) {};
@@ -89,68 +89,78 @@
 		p.normalize(); u.normalize();
 		v = cross(p, u);
 	};
+
+	const Vector &getEye() const { return eye; };
+	const Vector &getp() const { return p; };
+	const Vector &getu() const { return u; };
+	const Vector &getv() const { return v; };
+	const Float &getF() const { return F; };
 	void setEye(const Vector &aeye) { eye = aeye; };
-	void setAngle(const Float angle) { F = 2.*tan(angle/2.); };
+	void setp(const Vector &ap) { p = ap; };
+	void setu(const Vector &au) { u = au; };
+	void setv(const Vector &av) { v = av; };
+	void setF(const Float &aF) { F = aF; };
+	void setAngle(const Float angle) { F = 2.0f*tan(angle/2.0f); };
 	void rotate(const Quaternion &q);
 	void move(const Float fw, const Float left, const Float up);
 
-	Ray makeRay(Sample &samp)
+	const Ray makeRay(const Sample &samp) const
 	{
-		Vector dir = p - (u*samp.x + v*samp.y)*F;
-		dir.normalize();
+		Vector dir = normalize(p - (u*samp.x + v*samp.y)*F);
 		return Ray(eye, dir);
 	};
 
-#ifndef NO_SSE
-	void makeRayPacket(Sample *samples, RayPacket &rays)
+#ifndef NO_SIMD
+	void makeRayPacket(const Sample *samples, RayPacket &rays) const
 	{
-		__m128 m1x,m1y,m1z;
-		__m128 m2x,m2y,m2z;
-		__m128 m;
+		mfloat4 m1x,m1y,m1z;
+		mfloat4 m2x,m2y,m2z;
+		mfloat4 m;
 
 		// m1(xyz) = u * samples[i].x
-		m1x = _mm_set_ps1(u.x);
-		m1y = _mm_set_ps1(u.y);
-		m1z = _mm_set_ps1(u.z);
-		m = _mm_set_ps(samples[3].x, samples[2].x, samples[1].x, samples[0].x);
-		m1x = _mm_mul_ps(m1x, m);
-		m1y = _mm_mul_ps(m1y, m);
-		m1z = _mm_mul_ps(m1z, m);
+		m1x = mset1(u.x);
+		m1y = mset1(u.y);
+		m1z = mset1(u.z);
+		m = mset(samples[3].x, samples[2].x, samples[1].x, samples[0].x);
+		m1x = mmul(m1x, m);
+		m1y = mmul(m1y, m);
+		m1z = mmul(m1z, m);
 
 		// m2(xyz) = v * samples[i].y
-		m2x = _mm_set_ps1(v.x);
-		m2y = _mm_set_ps1(v.y);
-		m2z = _mm_set_ps1(v.z);
-		m = _mm_set_ps(samples[3].y, samples[2].y, samples[1].y, samples[0].y);
-		m2x = _mm_mul_ps(m2x, m);
-		m2y = _mm_mul_ps(m2y, m);
-		m2z = _mm_mul_ps(m2z, m);
+		m2x = mset1(v.x);
+		m2y = mset1(v.y);
+		m2z = mset1(v.z);
+		m = mset(samples[3].y, samples[2].y, samples[1].y, samples[0].y);
+		m2x = mmul(m2x, m);
+		m2y = mmul(m2y, m);
+		m2z = mmul(m2z, m);
 
 		// m1(xyz) = (m1 + m2) = (u*samples[i].x + v*samples[i].y)
-		m1x = _mm_add_ps(m1x, m2x);
-		m1y = _mm_add_ps(m1y, m2y);
-		m1z = _mm_add_ps(m1z, m2z);
+		m1x = madd(m1x, m2x);
+		m1y = madd(m1y, m2y);
+		m1z = madd(m1z, m2z);
 
 		// m1(xyz) = m1*F = (u*samples[i].x + v*samples[i].y)*F
-		m = _mm_set_ps1(F);
-		m1x = _mm_mul_ps(m1x, m);
-		m1y = _mm_mul_ps(m1y, m);
-		m1z = _mm_mul_ps(m1z, m);
+		m = mset1(F);
+		m1x = mmul(m1x, m);
+		m1y = mmul(m1y, m);
+		m1z = mmul(m1z, m);
 
 		// m1(xyz) = p - m1 = p - (u*samples[i].x + v*samples[i].y)*F = dir
-		m2x = _mm_set_ps1(p.x);
-		m2y = _mm_set_ps1(p.y);
-		m2z = _mm_set_ps1(p.z);
-		rays.dir.mx = _mm_sub_ps(m2x, m1x);
-		rays.dir.my = _mm_sub_ps(m2y, m1y);
-		rays.dir.mz = _mm_sub_ps(m2z, m1z);
+		m2x = mset1(p.x);
+		m2y = mset1(p.y);
+		m2z = mset1(p.z);
+		rays.dir.mx = msub(m2x, m1x);
+		rays.dir.my = msub(m2y, m1y);
+		rays.dir.mz = msub(m2z, m1z);
 
 		// copy origin
-		rays.o.mx = _mm_set_ps1(eye.x);
-		rays.o.my = _mm_set_ps1(eye.y);
-		rays.o.mz = _mm_set_ps1(eye.z);
+		rays.o.mx = mset1(eye.x);
+		rays.o.my = mset1(eye.y);
+		rays.o.mz = mset1(eye.z);
 
 		rays.dir.normalize();
+		rays.invdir = mOne/rays.dir;
 	};
 #endif
 };
@@ -180,14 +190,16 @@
 public:
 	Vector L;
 	Vector H;
+
 	BBox(): L(), H() {};
-	BBox(const Vector aL, const Vector aH): L(aL), H(aH) {};
-	Float w() { return H.x-L.x; };
-	Float h() { return H.y-L.y; };
-	Float d() { return H.z-L.z; };
-	bool intersect(const Ray &ray, Float &a, Float &b);
-#ifndef NO_SSE
-	__m128 intersect_packet(const RayPacket &rays, __m128 &a, __m128 &b);
+	BBox(const Vector &aL, const Vector &aH): L(aL), H(aH) {};
+
+	Float w() const { return H.x-L.x; };
+	Float h() const { return H.y-L.y; };
+	Float d() const { return H.z-L.z; };
+	bool intersect(const Ray &ray, Float &a, Float &b) const;
+#ifndef NO_SIMD
+	mfloat4 intersect_packet(const RayPacket &rays, mfloat4 &a, mfloat4 &b) const;
 #endif
 };
 
--- a/include/shapes.h	Fri May 02 13:27:47 2008 +0200
+++ b/include/shapes.h	Mon May 05 15:31:14 2008 +0200
@@ -55,10 +55,10 @@
 	// first intersection point
 	virtual bool intersect(const Ray &ray, Float &dist) const = 0;
 
-#ifndef NO_SSE
-	virtual __m128 intersect_packet(const RayPacket &rays, __m128 &dists)
+#ifndef NO_SIMD
+	virtual mfloat4 intersect_packet(const RayPacket &rays, mfloat4 &dists) const
 	{
-		__m128 results;
+		mfloat4 results;
 		((int*)&results)[0] = intersect(rays[0], ((float*)&dists)[0]) ? -1 : 0;
 		((int*)&results)[1] = intersect(rays[1], ((float*)&dists)[1]) ? -1 : 0;
 		((int*)&results)[2] = intersect(rays[2], ((float*)&dists)[2]) ? -1 : 0;
@@ -109,8 +109,8 @@
 	const Vector getCenter() const { return center; };
 	Float getRadius() const { return radius; };
 	ostream & dump(ostream &st) const;
-#ifndef NO_SSE
-	__m128 intersect_packet(const RayPacket &rays, __m128 &dists);
+#ifndef NO_SIMD
+	mfloat4 intersect_packet(const RayPacket &rays, mfloat4 &dists) const;
 #endif
 };
 
@@ -125,8 +125,8 @@
 	Box(const Vector &aL, const Vector &aH, Material *amaterial): L(aL), H(aH)
 	{
 		for (int i = 0; i < 3; i++)
-			if (L.cell[i] > H.cell[i])
-				swap(L.cell[i], H.cell[i]);
+			if (L[i] > H[i])
+				swap(L[i], H[i]);
 		material = amaterial;
 	};
 	bool intersect(const Ray &ray, Float &dist) const;
@@ -137,8 +137,8 @@
 	const Vector getL() const { return L; };
 	const Vector getH() const { return H; };
 	ostream & dump(ostream &st) const;
-#ifndef NO_SSE
-	__m128 intersect_packet(const RayPacket &rays, __m128 &dists);
+#ifndef NO_SIMD
+	mfloat4 intersect_packet(const RayPacket &rays, mfloat4 &dists) const;
 #endif
 };
 
@@ -149,6 +149,7 @@
 {
 public:
 	Vector P;
+
 	Vertex(const Vector &aP): P(aP) {};
 	virtual ~Vertex() {};
 	virtual ostream & dump(ostream &st) const;
@@ -161,6 +162,7 @@
 {
 public:
 	Vector N;
+
 	NormalVertex(const NormalVertex *v): Vertex(v->P), N(v->N) {};
 	NormalVertex(const Vector &aP): Vertex(aP) {};
 	NormalVertex(const Vector &aP, const Vector &aN): Vertex(aP), N(aN) {};
@@ -219,8 +221,8 @@
 	const Vector getNormal() const { return N; };
 	BBox get_bbox() const;
 	ostream & dump(ostream &st) const;
-#if not defined(NO_SSE) and defined(TRI_BARI_PRE)
-	__m128 intersect_packet(const RayPacket &rays, __m128 &dists);
+#if !defined(NO_SIMD) && defined(TRI_BARI_PRE)
+	mfloat4 intersect_packet(const RayPacket &rays, mfloat4 &dists) const;
 #endif
 };
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/simd.h	Mon May 05 15:31:14 2008 +0200
@@ -0,0 +1,82 @@
+/*
+ * simd.h: abstraction of Intel SSE instruction set
+ *
+ * This file is part of Pyrit Ray Tracer.
+ *
+ * Copyright 2008  Radek Brich
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef SIMD_H
+#define SIMD_H
+
+#include "common.h"
+
+#ifndef NO_SIMD
+
+#include <xmmintrin.h>
+
+typedef __m128 mfloat4;
+
+#define mZero  _mm_set_ps1(0.0f)
+#define mOne   _mm_set_ps1(1.0f)
+#define mTwo   _mm_set_ps1(2.0f)
+#define mEps   _mm_set_ps1(Eps)
+#define mMEps  _mm_set_ps1(-Eps)
+#define mInf   _mm_set_ps1(Inf)
+#define mMInf  _mm_set_ps1(-Inf)
+#define mAllSet  _mm_cmplt_ps(mZero, mOne)
+
+#define mset1 _mm_set_ps1
+#define mset _mm_set_ps
+
+#define madd _mm_add_ps
+#define msub _mm_sub_ps
+#define mmul _mm_mul_ps
+#define mdiv _mm_div_ps
+#define msqrt _mm_sqrt_ps
+
+#define mand _mm_and_ps
+#define mor  _mm_or_ps
+#define mcmpgt _mm_cmpgt_ps
+#define mcmplt _mm_cmplt_ps
+#define mcmpge _mm_cmpge_ps
+#define mcmple _mm_cmple_ps
+#define mcmpeq _mm_cmpeq_ps
+#define mcmpneq _mm_cmpneq_ps
+#define mmin _mm_min_ps
+#define mmax _mm_max_ps
+#define mmovemask _mm_movemask_ps
+
+inline const mfloat4 mselect(const mfloat4& mask, const mfloat4& a, const mfloat4& b)
+{
+	return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
+}
+
+inline const mfloat4 mfastpow(const mfloat4& base, const mfloat4& exponent)
+{
+    __m128 denom = _mm_mul_ps(exponent, base);
+    denom = _mm_sub_ps(exponent, denom);
+    denom = _mm_add_ps(base, denom);
+    return _mm_mul_ps(base, _mm_rcp_ps(denom));
+}
+#endif
+
+#endif
--- a/include/vector.h	Fri May 02 13:27:47 2008 +0200
+++ b/include/vector.h	Mon May 05 15:31:14 2008 +0200
@@ -3,7 +3,7 @@
  *
  * This file is part of Pyrit Ray Tracer.
  *
- * Copyright 2006, 2007  Radek Brich
+ * Copyright 2006, 2007, 2008  Radek Brich
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -31,6 +31,7 @@
 #include <iostream>
 
 #include "common.h"
+#include "simd.h"
 
 using namespace std;
 
@@ -42,8 +43,8 @@
 public:
 	// data
 	union {
-#ifndef NO_SSE
-		__m128 mps;
+#ifndef NO_SIMD
+		mfloat4 mf4;
 #endif
 		Float cell[4];
 		struct { Float x, y, z, w; };
@@ -51,16 +52,17 @@
 	};
 
 	// constructors
-#ifndef NO_SSE
-	Vector(__m128 m): mps(m) {};
+#ifndef NO_SIMD
+	Vector(mfloat4 m): mf4(m) {};
 #endif
 	Vector(): x(0.0f), y(0.0f), z(0.0f), w(1.0) {};
 	Vector(Float ax, Float ay, Float az): x(ax), y(ay), z(az), w(1.0) {};
 
 	// index operator
 	const Float &operator[](int index) const { return cell[index]; };
+	Float &operator[](int index) { return cell[index]; };
 
-	bool operator==(Vector &v) const { return x==v.x && y==v.y && z==v.z; };
+	bool operator==(const Vector &v) const { return x==v.x && y==v.y && z==v.z; };
 
 	// normalize
 	Vector normalize()
@@ -87,12 +89,12 @@
 	// accumulate
 	Vector operator+=(const Vector &v)
 	{
-#ifdef NO_SSE
+#ifdef NO_SIMD
 		x += v.x;
 		y += v.y;
 		z += v.z;
 #else
-		mps = _mm_add_ps(mps, v.mps);
+		mf4 = madd(mf4, v.mf4);
 #endif
 		return *this;
 	};
@@ -110,7 +112,7 @@
 	// cut
 	Vector operator/=(const Float &f)
 	{
-		Float finv = 1./f;
+		Float finv = 1.0f / f;
 		x *= finv;
 		y *= finv;
 		z *= finv;
@@ -120,20 +122,20 @@
 	// sum
 	friend Vector operator+(const Vector &a, const Vector &b)
 	{
-#ifdef NO_SSE
+#ifdef NO_SIMD
 		return Vector(a.x + b.x, a.y + b.y, a.z + b.z);
 #else
-		return Vector(_mm_add_ps(a.mps, b.mps));
+		return Vector(madd(a.mf4, b.mf4));
 #endif
 	};
 
 	// difference
 	friend Vector operator-(const Vector &a, const Vector &b)
 	{
-#ifdef NO_SSE
+#ifdef NO_SIMD
 		return Vector(a.x - b.x, a.y - b.y, a.z - b.z);
 #else
-		return Vector(_mm_sub_ps(a.mps, b.mps));
+		return Vector(msub(a.mf4, b.mf4));
 #endif
 	};
 
@@ -165,16 +167,16 @@
 	// scalar division
 	friend Vector operator/(const Vector &v, const Float &f)
 	{
-		const Float finv = 1./f;
+		const Float finv = 1.0f / f;
 		return Vector(v.x * finv, v.y * finv, v.z * finv);
 	};
 
 	friend Vector operator/(const Float &f, const Vector &v)
 	{
-#ifdef NO_SSE
+#ifdef NO_SIMD
 		return Vector(f / v.x, f / v.y, f / v.z);
 #else
-		return Vector(_mm_div_ps(_mm_set_ps1(f), v.mps));
+		return Vector(mdiv(mset1(f), v.mf4));
 #endif
 	};
 
@@ -193,10 +195,10 @@
 	// cell by cell product (only usable for colours)
 	friend Vector operator*(const Vector &a,  const Vector &b)
 	{
-#ifdef NO_SSE
+#ifdef NO_SIMD
 		return Vector(a.x * b.x, a.y * b.y, a.z * b.z);
 #else
-		return Vector(_mm_mul_ps(a.mps, b.mps));
+		return Vector(mmul(a.mf4, b.mf4));
 #endif
 	};
 
@@ -223,16 +225,16 @@
 
 typedef Vector Colour;
 
-#ifndef NO_SSE
+#ifndef NO_SIMD
 class VectorPacket
 {
 public:
 	union {
-		__m128 ma[3];
+		mfloat4 ma[3];
 		struct {
-			__m128 mx;
-			__m128 my;
-			__m128 mz;
+			mfloat4 mx;
+			mfloat4 my;
+			mfloat4 mz;
 		};
 		struct {
 			float x[4];
@@ -242,10 +244,10 @@
 	};
 
 	VectorPacket() {};
-	VectorPacket(__m128 ax, __m128 ay, __m128 az):
+	VectorPacket(mfloat4 ax, mfloat4 ay, mfloat4 az):
 		mx(ax), my(ay), mz(az) {};
 	VectorPacket(const Vector &v):
-		mx(_mm_set_ps1(v.x)), my(_mm_set_ps1(v.y)), mz(_mm_set_ps1(v.z)) {};
+		mx(mset1(v.x)), my(mset1(v.y)), mz(mset1(v.z)) {};
 
 	Vector getVector(int i) const
 	{
@@ -259,100 +261,92 @@
 
 	void normalize()
 	{
-		__m128 m,x,y,z;
-		x = _mm_mul_ps(mx, mx); // x*x
-		y = _mm_mul_ps(my, my); // y*y
-		z = _mm_mul_ps(mz, mz); // z*z
-		m = _mm_add_ps(x, y);
-		m = _mm_add_ps(m, z);     // x*x + y*y + z*z
-		m = _mm_sqrt_ps(m);
-		m = _mm_div_ps(mOne, m);   // m = 1/sqrt(m)
-		mx = _mm_mul_ps(mx, m);
-		my = _mm_mul_ps(my, m);
-		mz = _mm_mul_ps(mz, m);
+		mfloat4 m,x,y,z;
+		x = mmul(mx, mx); // x*x
+		y = mmul(my, my); // y*y
+		z = mmul(mz, mz); // z*z
+		m = madd(madd(x, y), z);     // x*x + y*y + z*z
+		m = mdiv(mOne, msqrt(m));   // m = 1/sqrt(m)
+		mx = mmul(mx, m);
+		my = mmul(my, m);
+		mz = mmul(mz, m);
 	};
 
 	// accumulate
 	VectorPacket operator+=(const VectorPacket &v)
 	{
-		mx = _mm_add_ps(mx, v.mx);
-		my = _mm_add_ps(my, v.my);
-		mz = _mm_add_ps(mz, v.mz);
+		mx = madd(mx, v.mx);
+		my = madd(my, v.my);
+		mz = madd(mz, v.mz);
 		return *this;
 	};
 
 	// add to non-masked components
-	VectorPacket selectiveAdd(__m128 mask, const VectorPacket &v)
+	VectorPacket selectiveAdd(const mfloat4 &mask, const VectorPacket &v)
 	{
-		mx = _mm_or_ps(_mm_and_ps(mask, _mm_add_ps(mx, v.mx)),
-			_mm_andnot_ps(mask, mx));
-		my = _mm_or_ps(_mm_and_ps(mask, _mm_add_ps(my, v.my)),
-			_mm_andnot_ps(mask, my));
-		mz = _mm_or_ps(_mm_and_ps(mask, _mm_add_ps(mz, v.mz)),
-			_mm_andnot_ps(mask, mz));
+		mx = mselect(mask, madd(mx, v.mx), mx);
+		my = mselect(mask, madd(my, v.my), my);
+		mz = mselect(mask, madd(mz, v.mz), mz);
 		return *this;
 	};
 
 	// add scalar to non-masked components
-	VectorPacket selectiveAdd(__m128 mask, const __m128 m)
+	VectorPacket selectiveAdd(const mfloat4 &mask, const mfloat4 &m)
 	{
-		mx = _mm_or_ps(_mm_and_ps(mask, _mm_add_ps(mx, m)),
-			_mm_andnot_ps(mask, mx));
-		my = _mm_or_ps(_mm_and_ps(mask, _mm_add_ps(my, m)),
-			_mm_andnot_ps(mask, my));
-		mz = _mm_or_ps(_mm_and_ps(mask, _mm_add_ps(mz, m)),
-			_mm_andnot_ps(mask, mz));
+		mx = mselect(mask, madd(mx, m), mx);
+		my = mselect(mask, madd(my, m), my);
+		mz = mselect(mask, madd(mz, m), mz);
 		return *this;
 	};
 
 	// dot product
-	friend __m128 dot(const VectorPacket &a, const VectorPacket &b)
+	friend mfloat4 dot(const VectorPacket &a, const VectorPacket &b)
 	{
-		return _mm_add_ps(_mm_add_ps(
-			_mm_mul_ps(a.mx, b.mx),
-			_mm_mul_ps(a.my, b.my)),
-			_mm_mul_ps(a.mz, b.mz));
+		return madd(madd(
+			mmul(a.mx, b.mx),
+			mmul(a.my, b.my)),
+			mmul(a.mz, b.mz));
 	};
 
 	friend VectorPacket operator+(const VectorPacket &a, const VectorPacket &b)
 	{
 		return VectorPacket(
-			_mm_add_ps(a.mx, b.mx),
-			_mm_add_ps(a.my, b.my),
-			_mm_add_ps(a.mz, b.mz));
+			madd(a.mx, b.mx),
+			madd(a.my, b.my),
+			madd(a.mz, b.mz));
 	};
 
 	friend VectorPacket operator-(const VectorPacket &a, const VectorPacket &b)
 	{
 		return VectorPacket(
-			_mm_sub_ps(a.mx, b.mx),
-			_mm_sub_ps(a.my, b.my),
-			_mm_sub_ps(a.mz, b.mz));
+			msub(a.mx, b.mx),
+			msub(a.my, b.my),
+			msub(a.mz, b.mz));
 	};
 
-	friend VectorPacket operator*(const VectorPacket &v,  const __m128 &m)
+	friend VectorPacket operator*(const VectorPacket &v,  const mfloat4 &m)
 	{
 		return VectorPacket(
-			_mm_mul_ps(v.mx, m),
-			_mm_mul_ps(v.my, m),
-			_mm_mul_ps(v.mz, m));
+			mmul(v.mx, m),
+			mmul(v.my, m),
+			mmul(v.mz, m));
 	};
 
-	friend VectorPacket operator/(const __m128 &m, const VectorPacket &v)
+	friend VectorPacket operator/(const mfloat4 &m, const VectorPacket &v)
 	{
 		return VectorPacket(
-			_mm_div_ps(m, v.mx),
-			_mm_div_ps(m, v.my),
-			_mm_div_ps(m, v.mz));
+			mdiv(m, v.mx),
+			mdiv(m, v.my),
+			mdiv(m, v.mz));
 	};
 
 	// cell by cell product (only usable for colours)
 	friend VectorPacket operator*(const VectorPacket &a,  const VectorPacket &b)
 	{
 		return VectorPacket(
-			_mm_mul_ps(a.mx, b.mx),
-			_mm_mul_ps(a.my, b.my),
-			_mm_mul_ps(a.mz, b.mz));
+			mmul(a.mx, b.mx),
+			mmul(a.my, b.my),
+			mmul(a.mz, b.mz));
 	};
 
 	// write to character stream
--- a/models/SConscript	Fri May 02 13:27:47 2008 +0200
+++ b/models/SConscript	Mon May 05 15:31:14 2008 +0200
@@ -1,15 +1,18 @@
-env = Environment()
-env.Append(BUILDERS = {'Copy':Builder(action=Copy('$TARGET','$SOURCE'), single_source=True)})
+Import('env')
+
+myenv = Environment()
+myenv.Append(BUILDERS = {'Copy':Builder(action=Copy('$TARGET','$SOURCE'), single_source=True)})
 files = ['lwo/Nissan300ZX.lwo', 'obj/cube.obj', 'obj/sphere.obj', 'obj/monkey.obj']
 
 l = []
 for file in files:
-	l.append( env.Copy('#build/models/'+file, file) )
+	l.append( myenv.Copy('#build/models/'+file, file) )
 
 env.Alias('local-models', l)
 
 plydir = Dir('#build/models/ply/')
-Command(plydir, [], Dir('#models/').abspath+'/download-stanford $TARGET')
+import os
+env.Command(plydir, [], '"'+Dir('#models/').abspath+os.sep+'download-stanford" $TARGET')
 Clean(plydir, plydir)
 
 env.Alias('download-models', plydir)
--- a/src/SConscript	Fri May 02 13:27:47 2008 +0200
+++ b/src/SConscript	Mon May 05 15:31:14 2008 +0200
@@ -1,7 +1,7 @@
-Import('env buildmodule')
+Import('env buildmodule cc')
 
-env = env.Clone(CPPPATH = '#include')
-pyenv = env.Clone()
+myenv = env.Clone(CPPPATH = '#include')
+pyenv = myenv.Clone()
 if env['PLATFORM'] == 'win32':
 	import sys
 	pythonver = '%c%c' % (sys.version[0], sys.version[2])
@@ -23,19 +23,22 @@
 objs = []
 shared_objs = []
 for src in sources:
-	objs.append( env.Object(src) )
-	shared_objs.append( env.SharedObject(src) )
+	objs.append( myenv.Object(src) )
+	shared_objs.append( myenv.SharedObject(src) )
 
 if buildmodule:
+	if cc == 'gcc':
+		ccflags = '$CCFLAGS -Wno-write-strings'
+	else:
+		ccflags = '$CCFLAGS'
 	pymodule = pyenv.SharedLibrary('pyrit',
 		['raytracermodule.cc']+shared_objs,
-		SHLIBPREFIX = '',
-		CCFLAGS = '$CCFLAGS -Wno-write-strings')
+		SHLIBPREFIX = '', CCFLAGS = ccflags)
 	env.Alias('shared-objs', shared_objs)
 	env.Alias('python-module', pymodule)
 	Return('pymodule')
 else:
-	lib = env.StaticLibrary('pyrit', objs)
+	lib = myenv.StaticLibrary('pyrit', objs)
 	env.Alias('objs', objs)
 	env.Alias('static-lib', lib)
 	Return('lib')
--- a/src/container.cc	Fri May 02 13:27:47 2008 +0200
+++ b/src/container.cc	Mon May 05 15:31:14 2008 +0200
@@ -60,8 +60,8 @@
 	return nearest_shape;
 }
 
-#ifndef NO_SSE
-void Container::packet_intersection(const Shape **origin_shapes, const RayPacket &rays,
+#ifndef NO_SIMD
+void Container::packet_intersection(const Shape* const* origin_shapes, const RayPacket &rays,
 	Float *nearest_distances, Shape **nearest_shapes)
 {
 	for (int i = 0; i < 4; i++)
--- a/src/kdtree.cc	Fri May 02 13:27:47 2008 +0200
+++ b/src/kdtree.cc	Mon May 05 15:31:14 2008 +0200
@@ -64,12 +64,10 @@
 {
 	if (isLeaf())
 		delete getShapes();
-	else
-		delete[] getLeftChild();
 }
 
 // kd-tree recursive build algorithm, inspired by PBRT (www.pbrt.org)
-void KdTree::recursive_build(KdNode *node, BBox bounds, int maxdepth)
+void KdTree::recursive_build(KdNode *node, const BBox &bounds, int maxdepth)
 {
 	ShapeList *shapes = node->getShapes();
 
@@ -102,12 +100,12 @@
 		sort(edges[ax].begin(), edges[ax].end());
 
 	// choose best split pos
-	const Float K = 1.4; // constant, K = cost of traversal / cost of ray-triangle intersection
+	const Float K = 1.4f; // constant, K = cost of traversal / cost of ray-triangle intersection
 	Float SAV = (bounds.w()*bounds.h() +  // surface area of node
 		bounds.w()*bounds.d() + bounds.h()*bounds.d());
 	Float cost = SAV * (K + shapes->size()); // initial cost = non-split cost
 
-	vector<ShapeBound>::iterator edge, splitedge = edges[2].end();
+	vector<ShapeBound>::iterator edge, splitedge = edges[0].end();
 	int axis = 0;
 	for (int ax = 0; ax < 3; ax++)
 	{
@@ -120,8 +118,8 @@
 				rnum--;
 
 			// calculate SAH cost of this split
-			lbb.H.cell[ax] = edge->pos;
-			rbb.L.cell[ax] = edge->pos;
+			lbb.H[ax] = edge->pos;
+			rbb.L[ax] = edge->pos;
 			Float SAL = (lbb.w()*lbb.h() + lbb.w()*lbb.d() + lbb.h()*lbb.d());
 			Float SAR = (rbb.w()*rbb.h() + rbb.w()*rbb.d() + rbb.h()*rbb.d());
 			Float splitcost = K*SAV + SAL*(K + lnum) + SAR*(K + rnum);
@@ -138,7 +136,10 @@
 		}
 	}
 
-	if (splitedge == edges[2].end())
+	// we actually need to compare with edges[0].end(), but
+	// MSVC does not allow comparison of iterators from differen instances of vector
+	// it's OK this way, because axis will be zero if no good split was found
+	if (splitedge == edges[axis].end())
 	{
 		node->setLeaf();
 		return;
@@ -152,18 +153,18 @@
 	static ofstream F("kdtree.obj");
 	Vector v;
 	static int f=0;
-	v.cell[axis] = node->getSplit();
-	v.cell[(axis+1)%3] = bounds.L.cell[(axis+1)%3];
-	v.cell[(axis+2)%3] = bounds.L.cell[(axis+2)%3];
+	v[axis] = node->getSplit();
+	v[(axis+1)%3] = bounds.L[(axis+1)%3];
+	v[(axis+2)%3] = bounds.L[(axis+2)%3];
 	F << "v " << v.x << " " << v.y << " " << v.z << endl;
-	v.cell[(axis+1)%3] = bounds.L.cell[(axis+1)%3];
-	v.cell[(axis+2)%3] = bounds.H.cell[(axis+2)%3];
+	v[(axis+1)%3] = bounds.L[(axis+1)%3];
+	v[(axis+2)%3] = bounds.H[(axis+2)%3];
 	F << "v " << v.x << " " << v.y << " " << v.z << endl;
-	v.cell[(axis+1)%3] = bounds.H.cell[(axis+1)%3];
-	v.cell[(axis+2)%3] = bounds.H.cell[(axis+2)%3];
+	v[(axis+1)%3] = bounds.H[(axis+1)%3];
+	v[(axis+2)%3] = bounds.H[(axis+2)%3];
 	F << "v " << v.x << " " << v.y << " " << v.z << endl;
-	v.cell[(axis+1)%3] = bounds.H.cell[(axis+1)%3];
-	v.cell[(axis+2)%3] = bounds.L.cell[(axis+2)%3];
+	v[(axis+1)%3] = bounds.H[(axis+1)%3];
+	v[(axis+2)%3] = bounds.L[(axis+2)%3];
 	F << "v " << v.x << " " << v.y << " " << v.z << endl;
 	F << "f " << f+1 << " " << f+2 << " " << f+3 << " " << f+4 << endl;
 	f += 4;
@@ -174,9 +175,10 @@
 
 	BBox lbb = bounds;
 	BBox rbb = bounds;
-	lbb.H.cell[axis] = node->getSplit();
-	rbb.L.cell[axis] = node->getSplit();
-	node->setChildren(new KdNode[2]);
+	lbb.H[axis] = node->getSplit();
+	rbb.L[axis] = node->getSplit();
+	node->setChildren(new (mempool.alloc()) KdNode);
+	new (mempool.alloc()) KdNode;
 	node->setAxis(axis);
 
 	for (edge = edges[axis].begin(); edge != splitedge; edge++)
@@ -219,7 +221,12 @@
 	KdNode *farchild, *node;
 	node = root;
 
+#ifdef MSVC
+	// MSVC wants constant expression here... hope it won't overflow :)
+	StackElem stack[64];
+#else
 	StackElem stack[max_depth];
+#endif
 
 	int entry = 0, exit = 1;
 	stack[entry].t = a;
@@ -294,11 +301,11 @@
 			stack[exit].prev = tmp;
 			stack[exit].t = t;
 			stack[exit].node = farchild;
-			stack[exit].pb.cell[axis] = splitVal;
-			stack[exit].pb.cell[mod3[axis+1]] = ray.o.cell[mod3[axis+1]]
-				+ t * ray.dir.cell[mod3[axis+1]];
-			stack[exit].pb.cell[mod3[axis+2]] = ray.o.cell[mod3[axis+2]]
-				+ t * ray.dir.cell[mod3[axis+2]];
+			stack[exit].pb[axis] = splitVal;
+			stack[exit].pb[mod3[axis+1]] = ray.o[mod3[axis+1]]
+				+ t * ray.dir[mod3[axis+1]];
+			stack[exit].pb[mod3[axis+2]] = ray.o[mod3[axis+2]]
+				+ t * ray.dir[mod3[axis+2]];
 		}
 
 		/* current node is the leaf . . . empty or full */
@@ -328,22 +335,22 @@
 	return NULL;
 }
 
-#ifndef NO_SSE
+#ifndef NO_SIMD
 // stack element for kd-tree traversal (packet version)
 struct StackElem4
 {
 	KdNode* node; /* pointer to far child */
-	__m128 t; /* the entry/exit signed distance */
+	mfloat4 t; /* the entry/exit signed distance */
 	VectorPacket pb; /* the coordinates of entry/exit point */
 	int prev;
 };
 
-void KdTree::packet_intersection(const Shape **origin_shapes, const RayPacket &rays,
+void KdTree::packet_intersection(const Shape* const* origin_shapes, const RayPacket &rays,
 		Float *nearest_distances, Shape **nearest_shapes)
 {
-	__m128 a, b; /* entry/exit signed distance */
-	__m128 t;    /* signed distance to the splitting plane */
-	__m128 mask = mZero;
+	mfloat4 a, b; /* entry/exit signed distance */
+	mfloat4 t;    /* signed distance to the splitting plane */
+	mfloat4 mask = mZero;
 
 	/* if we have no tree, fall back to naive test */
 	if (!built)
@@ -353,27 +360,29 @@
 	memset(nearest_shapes, 0, 4*sizeof(Shape*));
 
 	mask = bbox.intersect_packet(rays, a, b);
-	if (!_mm_movemask_ps(mask))
+	if (!mmovemask(mask))
 		return;
 
 	/* pointers to the far child node and current node */
 	KdNode *farchild, *node;
 	node = root;
 
+#ifdef MSVC
+	// MSVC wants constant expression here... hope it won't overflow :)
+	StackElem4 stack[64];
+#else
 	StackElem4 stack[max_depth];
+#endif
 
 	int entry = 0, exit = 1;
 	stack[entry].t = a;
 
 	/* distinguish between internal and external origin of a ray*/
-	t = _mm_cmplt_ps(a, mZero);
+	t = mcmplt(a, mZero);
 	stack[entry].pb = rays.o + rays.dir * a;
-	stack[entry].pb.mx = _mm_or_ps(_mm_andnot_ps(t, stack[entry].pb.mx),
-		_mm_and_ps(t, rays.o.mx));
-	stack[entry].pb.my = _mm_or_ps(_mm_andnot_ps(t, stack[entry].pb.my),
-		_mm_and_ps(t, rays.o.my));
-	stack[entry].pb.mz = _mm_or_ps(_mm_andnot_ps(t, stack[entry].pb.mz),
-		_mm_and_ps(t, rays.o.mz));
+	stack[entry].pb.mx = mselect(t, rays.o.mx, stack[entry].pb.mx);
+	stack[entry].pb.my = mselect(t, rays.o.my, stack[entry].pb.my);
+	stack[entry].pb.mz = mselect(t, rays.o.mz, stack[entry].pb.mz);
 
 	/* setup initial exit point in the stack */
 	stack[exit].t = b;
@@ -382,7 +391,7 @@
 
 	/* loop, traverse through the whole kd-tree,
 	until an object is intersected or ray leaves the scene */
-	__m128 splitVal;
+	mfloat4 splitVal;
 	int axis;
 	static const int mod3[] = {0,1,2,0,1};
 	const VectorPacket invdirs = mOne / rays.dir;
@@ -392,21 +401,21 @@
 		while (!node->isLeaf())
 		{
 			/* retrieve position of splitting plane */
-			splitVal = _mm_set_ps1(node->getSplit());
+			splitVal = mset1(node->getSplit());
 			axis = node->getAxis();
 
 			// mask out invalid rays with near > far
-			const __m128 curmask = _mm_and_ps(mask, _mm_cmple_ps(stack[entry].t, stack[exit].t));
-			const __m128 entry_lt = _mm_cmplt_ps(stack[entry].pb.ma[axis], splitVal);
-			const __m128 entry_gt = _mm_cmpgt_ps(stack[entry].pb.ma[axis], splitVal);
-			const __m128 exit_lt = _mm_cmplt_ps(stack[exit].pb.ma[axis], splitVal);
-			const __m128 exit_gt = _mm_cmpgt_ps(stack[exit].pb.ma[axis], splitVal);
+			const mfloat4 curmask = mand(mask, mcmple(stack[entry].t, stack[exit].t));
+			const mfloat4 entry_lt = mcmplt(stack[entry].pb.ma[axis], splitVal);
+			const mfloat4 entry_gt = mcmpgt(stack[entry].pb.ma[axis], splitVal);
+			const mfloat4 exit_lt = mcmplt(stack[exit].pb.ma[axis], splitVal);
+			const mfloat4 exit_gt = mcmpgt(stack[exit].pb.ma[axis], splitVal);
 
 			// if all of
 			// stack[entry].pb[axis] <= splitVal,
 			// stack[exit].pb[axis] <= splitVal
-			if (!_mm_movemask_ps(
-				_mm_and_ps(_mm_or_ps(entry_gt, exit_gt), curmask)))
+			if (!mmovemask(
+				mand(mor(entry_gt, exit_gt), curmask)))
 			{
 				node = node->getLeftChild();
 				continue;
@@ -415,8 +424,8 @@
 			// if all of
 			// stack[entry].pb[axis] >= splitVal,
 			// stack[exit].pb[axis] >= splitVal
-			if (!_mm_movemask_ps(
-				_mm_and_ps(_mm_or_ps(entry_lt, exit_lt), curmask)))
+			if (!mmovemask(
+				mand(mor(entry_lt, exit_lt), curmask)))
 			{
 				node = node->getRightChild();
 				continue;
@@ -425,14 +434,14 @@
 			// any of
 			// stack[entry].pb[axis] < splitVal,
 			// stack[exit].pb[axis] > splitVal
-			bool cond1 = _mm_movemask_ps(
-				_mm_and_ps(_mm_and_ps(entry_lt, exit_gt), curmask));
+			int cond1 = mmovemask(
+				mand(mand(entry_lt, exit_gt), curmask));
 
 			// any of
 			// stack[entry].pb[axis] > splitVal,
 			// stack[exit].pb[axis] < splitVal
-			bool cond2 = _mm_movemask_ps(
-				_mm_and_ps(_mm_and_ps(entry_gt, exit_lt), curmask));
+			int cond2 = mmovemask(
+				mand(mand(entry_gt, exit_lt), curmask));
 
 			if ((!cond1 && !cond2) || (cond1 && cond2))
 			{
@@ -459,7 +468,7 @@
 			/* traverse both children */
 
 			/* signed distance to the splitting plane */
-			t = _mm_mul_ps(_mm_sub_ps(splitVal, rays.o.ma[axis]), invdirs.ma[axis]);
+			t = mmul(msub(splitVal, rays.o.ma[axis]), invdirs.ma[axis]);
 
 			/* setup the new exit point and push it onto stack */
 			register int tmp = exit;
@@ -474,22 +483,22 @@
 			stack[exit].node = farchild;
 			stack[exit].pb.ma[axis] = splitVal;
 			stack[exit].pb.ma[mod3[axis+1]] =
-				_mm_add_ps(rays.o.ma[mod3[axis+1]], _mm_mul_ps(t, rays.dir.ma[mod3[axis+1]]));
+				madd(rays.o.ma[mod3[axis+1]], mmul(t, rays.dir.ma[mod3[axis+1]]));
 			stack[exit].pb.ma[mod3[axis+2]] =
-				_mm_add_ps(rays.o.ma[mod3[axis+2]], _mm_mul_ps(t, rays.dir.ma[mod3[axis+2]]));
+				madd(rays.o.ma[mod3[axis+2]], mmul(t, rays.dir.ma[mod3[axis+2]]));
 		}
 
 		/* current node is the leaf . . . empty or full */
-		__m128 dists = stack[exit].t;
+		mfloat4 dists = stack[exit].t;
 		ShapeList::iterator shape;
-		__m128 results;
-		__m128 newmask = mask;
+		mfloat4 results;
+		mfloat4 newmask = mask;
 		for (shape = node->getShapes()->begin(); shape != node->getShapes()->end(); shape++)
 		{
 			results = (*shape)->intersect_packet(rays, dists);
-			int valid = _mm_movemask_ps(
-				_mm_and_ps(mask, _mm_and_ps(results,
-				_mm_cmpge_ps(dists, _mm_sub_ps(stack[entry].t, mEps)))));
+			int valid = mmovemask(
+				mand(mask, mand(results,
+				mcmpge(dists, msub(stack[entry].t, mEps)))));
 			for (int i = 0; i < 4; i++)
 			{
 				if (*shape != origin_shapes[i] && ((valid>>i)&1))
@@ -502,7 +511,7 @@
 		}
 
 		mask = newmask;
-		if (!_mm_movemask_ps(mask))
+		if (!mmovemask(mask))
 			return;
 
 		entry = exit;
--- a/src/octree.cc	Fri May 02 13:27:47 2008 +0200
+++ b/src/octree.cc	Mon May 05 15:31:14 2008 +0200
@@ -37,7 +37,7 @@
 		delete[] children;
 }
 
-void OctreeNode::subdivide(BBox bbox, int maxdepth)
+void OctreeNode::subdivide(const BBox &bbox, int maxdepth)
 {
 	ShapeList *l_shapes = getShapes();
 
@@ -45,9 +45,9 @@
 	makeChildren();
 
 	// evaluate centres for axes
-	const Float xsplit = (bbox.L.x + bbox.H.x)*0.5;
-	const Float ysplit = (bbox.L.y + bbox.H.y)*0.5;
-	const Float zsplit = (bbox.L.z + bbox.H.z)*0.5;
+	const Float xsplit = (bbox.L.x + bbox.H.x)*0.5f;
+	const Float ysplit = (bbox.L.y + bbox.H.y)*0.5f;
+	const Float zsplit = (bbox.L.z + bbox.H.z)*0.5f;
 
 	// set bounding boxes for children
 	BBox childbb[8] = {bbox, bbox, bbox, bbox, bbox, bbox, bbox, bbox};
@@ -155,7 +155,12 @@
 	if (!built)
 		return Container::nearest_intersection(origin_shape, ray, nearest_distance);
 
+#ifdef MSVC
+	// MSVC wants constant expression here... hope it won't overflow :)
+	OctreeTravState st[20];
+#else
 	OctreeTravState st[max_depth+1];
+#endif
 	register OctreeTravState *st_cur = st;
 
 #	define node	st_cur->node
@@ -192,12 +197,12 @@
 		a |= 1;
 	}
 
-	if (rdir.x == 0.0) rdir.x = Eps;
-	if (rdir.y == 0.0) rdir.y = Eps;
-	if (rdir.z == 0.0) rdir.z = Eps;
-	rdir.x = 1.0/rdir.x;
-	rdir.y = 1.0/rdir.y;
-	rdir.z = 1.0/rdir.z;
+	if (rdir.x == 0.0f) rdir.x = Eps;
+	if (rdir.y == 0.0f) rdir.y = Eps;
+	if (rdir.z == 0.0f) rdir.z = Eps;
+	rdir.x = 1.0f/rdir.x;
+	rdir.y = 1.0f/rdir.y;
+	rdir.z = 1.0f/rdir.z;
 
 	tx0 = (bbox.L.x - ro.x) * rdir.x;
 	tx1 = (bbox.H.x - ro.x) * rdir.x;
@@ -238,9 +243,9 @@
 				}
 				else
 				{
-					txm = 0.5 * (tx0+tx1);
-					tym = 0.5 * (ty0+ty1);
-					tzm = 0.5 * (tz0+tz1);
+					txm = 0.5f * (tx0+tx1);
+					tym = 0.5f * (ty0+ty1);
+					tzm = 0.5f * (tz0+tz1);
 
 					// first node
 					st_cur->next = 0;
--- a/src/pixmap.cc	Fri May 02 13:27:47 2008 +0200
+++ b/src/pixmap.cc	Mon May 05 15:31:14 2008 +0200
@@ -26,18 +26,19 @@
 
 #include <stdio.h>
 
+#include "config.h"
+#include "pixmap.h"
+
 #ifdef HAVE_PNG
 #	include <png.h>
 #endif
 
-#include "pixmap.h"
-
 unsigned char *Pixmap::getCharData() const
 {
 	unsigned char *cdata = new unsigned char[w*h*3];
 	Float *fd = fdata;
 
-#ifdef NO_SSE
+#if 1 //def NO_SIMD
 	for (unsigned char *cd = cdata; cd != cdata + w*h*3; cd++, fd++)
 		if (*fd > 1.0)
 			*cd = 255;
@@ -49,7 +50,7 @@
 	for (unsigned char *cd = cdata; cd < cdata + w*h*3; cd += 4, fd += 4)
 	{
 		m = _mm_cvtps_pi16(_mm_mul_ps(cmax,
-			_mm_min_ps(mOne, _mm_set_ps(fd[3],fd[2],fd[1],fd[0]))));
+			_mm_min_ps(mOne, _mm_load_ps(fd))));
 		for (int i = 0; i < 4; i++)
 			cd[i] = ((unsigned char *)&m)[i<<1];
 	}
--- a/src/raytracer.cc	Fri May 02 13:27:47 2008 +0200
+++ b/src/raytracer.cc	Mon May 05 15:31:14 2008 +0200
@@ -120,15 +120,15 @@
 	return acc;
 }
 
-#ifndef NO_SSE
-VectorPacket Raytracer::PhongShader_packet(const Shape **shapes,
+#ifndef NO_SIMD
+VectorPacket Raytracer::PhongShader_packet(const Shape* const* shapes,
 	const VectorPacket &P, const VectorPacket &N, const VectorPacket &V)
 {
 	VectorPacket acc, colour;
-	union { __m128 ambient; float ambient_f[4]; };
-	union { __m128 diffuse; float diffuse_f[4]; };
-	union { __m128 specular; float specular_f[4]; };
-	union { __m128 shininess; float shininess_f[4]; };
+	union { mfloat4 ambient; float ambient_f[4]; };
+	union { mfloat4 diffuse; float diffuse_f[4]; };
+	union { mfloat4 specular; float specular_f[4]; };
+	union { mfloat4 shininess; float shininess_f[4]; };
 
 	for (int i = 0; i < 4; i++)
 		if (shapes[i] == NULL)
@@ -154,38 +154,38 @@
 	// ambient
 	acc = colour * ambient;
 
-	Shape **shadow_shapes;
+	Shape *shadow_shapes[4];
 	vector<Light*>::iterator light;
 	for (light = lights.begin(); light != lights.end(); light++)
 	{
 		 // direction vector to light
 		VectorPacket L = VectorPacket((*light)->pos) - P;
 		L.normalize();
-		const __m128 L_dot_N = dot(L, N);
-		__m128 valid = _mm_cmpgt_ps(L_dot_N, mZero);
+		const mfloat4 L_dot_N = dot(L, N);
+		mfloat4 valid = mcmpgt(L_dot_N, mZero);
 
 		// test if this light is occluded (sharp shadows)
 		if ((*light)->cast_shadows)
 		{
 			const RayPacket shadow_rays = RayPacket(P, L);
-			union { __m128 dists; float dists_f[4]; };
+			union { mfloat4 dists; float dists_f[4]; };
 			dists = mInf;
 			top->packet_intersection(shapes, shadow_rays,
 				dists_f, shadow_shapes);
-			valid = _mm_and_ps(valid, _mm_cmpeq_ps(dists, mInf));
+			valid = mand(valid, mcmpeq(dists, mInf));
 		}
 
-		const VectorPacket R = L - N * _mm_mul_ps(mTwo, L_dot_N);
-		const __m128 R_dot_V = dot(R, V);
+		const VectorPacket R = L - N * mmul(mTwo, L_dot_N);
+		const mfloat4 R_dot_V = dot(R, V);
 
 		// diffuse
 		acc.selectiveAdd(valid,
-			colour * VectorPacket((*light)->colour) * _mm_mul_ps(diffuse, L_dot_N));
+			colour * VectorPacket((*light)->colour) * mmul(diffuse, L_dot_N));
 
 		// specular
-		valid = _mm_and_ps(valid, _mm_cmpgt_ps(R_dot_V, mZero));
-		__m128 spec = _mm_mul_ps(_mm_mul_ps(specular, _mm_set_ps1((*light)->colour.r)),
-			_mm_fastpow(R_dot_V, shininess));
+		valid = mand(valid, mcmpgt(R_dot_V, mZero));
+		mfloat4 spec = mmul(mmul(specular, mset1((*light)->colour.r)),
+			mfastpow(R_dot_V, shininess));
 		acc.selectiveAdd(valid, spec);
 	}
 	return acc;
@@ -297,23 +297,22 @@
 	}
 }
 
-#ifndef NO_SSE
+#ifndef NO_SIMD
 void Raytracer::raytracePacket(RayPacket &rays, Colour *results)
 {
 	union {
 		float nearest_distances[4];
-		__m128 m_nearest_distances;
+		mfloat4 m_nearest_distances;
 	};
-	__m128 mask;
+	mfloat4 mask;
 	Shape *nearest_shapes[4];
 	static const Shape *origin_shapes[4] = {NULL, NULL, NULL, NULL};
 	m_nearest_distances = mInf;
-	mask = mAllSet;
 
 	top->packet_intersection(origin_shapes, rays, nearest_distances, nearest_shapes);
 
-	mask = _mm_cmpneq_ps(m_nearest_distances, mInf);
-	if (!_mm_movemask_ps(mask))
+	mask = mcmpneq(m_nearest_distances, mInf);
+	if (!mmovemask(mask))
 	{
 		for (int i = 0; i < 4; i++)
 			results[i] = bg_colour;
@@ -321,34 +320,29 @@
 	}
 
 	const VectorPacket P = rays.o + rays.dir * m_nearest_distances; // point of intersection
-
 	VectorPacket normal;
 	for (int i = 0; i < 4; i++)
 		if (nearest_shapes[i] != NULL)
 			normal.setVector(i, nearest_shapes[i]->normal(P.getVector(i)));
 
 	// make shapes double sided
-	__m128 from_inside = _mm_cmpgt_ps(dot(normal, rays.dir), mZero);
-	normal.mx = _mm_or_ps(_mm_and_ps(from_inside, _mm_sub_ps(mZero, normal.mx)),
-		_mm_andnot_ps(from_inside, normal.mx));
-	normal.my = _mm_or_ps(_mm_and_ps(from_inside, _mm_sub_ps(mZero, normal.my)),
-		_mm_andnot_ps(from_inside, normal.my));
-	normal.mz = _mm_or_ps(_mm_and_ps(from_inside, _mm_sub_ps(mZero, normal.mz)),
-		_mm_andnot_ps(from_inside, normal.mz));
+	mfloat4 from_inside = mcmpgt(dot(normal, rays.dir), mZero);
+	normal.mx = mselect(from_inside, msub(mZero, normal.mx), normal.mx);
+	normal.my = mselect(from_inside, msub(mZero, normal.my), normal.my);
+	normal.mz = mselect(from_inside, msub(mZero, normal.mz), normal.mz);
 
 	// shading function
-	VectorPacket pres =
-		PhongShader_packet(const_cast<const Shape**>(nearest_shapes), P, normal, rays.dir);
-	//pres.mx = _mm_or_ps(_mm_and_ps(mask, pres.mx), _mm_andnot_ps(mask, _mm_set_ps1(bg_colour.r)));
-	//pres.my = _mm_or_ps(_mm_and_ps(mask, pres.my), _mm_andnot_ps(mask, _mm_set_ps1(bg_colour.g)));
-	//pres.mz = _mm_or_ps(_mm_and_ps(mask, pres.mz), _mm_andnot_ps(mask, _mm_set_ps1(bg_colour.b)));
+	VectorPacket pres = PhongShader_packet(nearest_shapes, P, normal, rays.dir);
+	//pres.mx = mselect(mask, pres.mx, mset1(bg_colour.r));
+	//pres.my = mselect(mask, pres.my, mset1(bg_colour.g));
+	//pres.mz = mselect(mask, pres.mz, mset1(bg_colour.b));
 
 	for (int i = 0; i < 4; i++)
 		if (nearest_shapes[i] != NULL)
 		{
 			results[i] = pres.getVector(i);
 			lightScatter(rays[i], nearest_shapes[i], 0,
-				P.getVector(i), normal.getVector(i), (_mm_movemask_ps(from_inside)>>i)&1,
+				P.getVector(i), normal.getVector(i), (mmovemask(from_inside)>>i)&1,
 				results[i]);
 		}
 		else
@@ -364,7 +358,7 @@
 	Colour my_colours[my_queue_size];
 	int my_count;
 	Ray ray;
-#ifndef NO_SSE
+#ifndef NO_SIMD
 	RayPacket rays;
 	const bool can_use_packets = (rt->use_packets && rt->sampler->packetableSamples());
 #endif
@@ -407,7 +401,7 @@
 		pthread_mutex_unlock(&rt->sample_queue_mutex);
 
 		// do the work
-#ifndef NO_SSE
+#ifndef NO_SIMD
 		if (can_use_packets)
 		{
 			// packet ray tracing
@@ -435,6 +429,7 @@
 			rt->sampler->saveSample(my_queue[i], my_colours[i]);
 		pthread_mutex_unlock(&rt->sampler_mutex);
 	}
+	return NULL;
 }
 
 void Raytracer::render()
@@ -455,7 +450,7 @@
 
 	// create workers
 	dbgmsg(1, "* using %d threads\n", num_threads);
-	pthread_t threads[num_threads];
+	pthread_t *threads = new pthread_t[num_threads];
 
 	dbgmsg(1, "* raytracing...\n");
 
@@ -536,6 +531,7 @@
 		phase ++;
 	}
 
+	delete[] threads;
 	delete[] sample_queue;
 }
 
--- a/src/sampler.cc	Fri May 02 13:27:47 2008 +0200
+++ b/src/sampler.cc	Mon May 05 15:31:14 2008 +0200
@@ -223,19 +223,10 @@
 			}
 			else
 			{
-				sx++;
-				if (sx >= w)
-				{
-					sx = 0;
-					sy++;
-				}
-				if (sy >= h)
-					return false;
-
 				if (subsample > 1)
 				{
 					// find next not interpolated pixel
-					while ( *(buffer + 3*(sy*w + sx)) >= 0. )
+					do
 					{
 						sx++;
 						if (sx >= w)
@@ -246,6 +237,46 @@
 						if (sy >= h)
 							return false;
 					}
+					while ( *(buffer + 3*(sy*w + sx)) >= 0. );
+				}
+				else if (!oversample && !(w&1) && !(h&1))
+				{
+					// generate good raster for packet tracing
+					const int j = ((sy&1)<<1) + (sx&1);
+					switch (j)
+					{
+						case 0:
+						case 2:
+							sx++;
+							break;
+						case 1:
+							sx--;
+							sy++;
+							break;
+						case 3:
+							sx++;
+							if (sx >= w)
+							{
+								sx = 0;
+								sy++;
+							}
+							else
+								sy--;
+							if (sy >= h)
+								return false;
+							break;
+					}
+				}
+				else
+				{
+					sx++;
+					if (sx >= w)
+					{
+						sx = 0;
+						sy++;
+					}
+					if (sy >= h)
+						return false;
 				}
 
 				s->x = (Float)sx/h - (Float)w/h/2.0;
--- a/src/scene.cc	Fri May 02 13:27:47 2008 +0200
+++ b/src/scene.cc	Mon May 05 15:31:14 2008 +0200
@@ -73,7 +73,7 @@
 }
 
 /* http://www.siggraph.org/education/materials/HyperGraph/raytrace/rtinter3.htm */
-bool BBox::intersect(const Ray &ray, Float &a, Float &b)
+bool BBox::intersect(const Ray &ray, Float &a, Float &b) const
 {
 	register Float tnear = -Inf;
 	register Float tfar = Inf;
@@ -108,46 +108,33 @@
 	return true;
 }
 
-#ifndef NO_SSE
+#ifndef NO_SIMD
 // rewrite of BBox::intersect for ray packets
-__m128 BBox::intersect_packet(const RayPacket &rays, __m128 &a, __m128 &b)
+mfloat4 BBox::intersect_packet(const RayPacket &rays, mfloat4 &a, mfloat4 &b) const
 {
-	register __m128 tnear = mZero;
-	register __m128 tfar = mInf;
-	register __m128 t1, t2;
-	register __m128 mask = mAllSet;
-
-	for (int i = 0; i < 3; i++)
-	{
-		const __m128 mL = _mm_set_ps1(L[i]);
-		const __m128 mH = _mm_set_ps1(H[i]);
-		mask = _mm_and_ps(mask,
-		_mm_or_ps(
-		_mm_or_ps(_mm_cmplt_ps(rays.dir.ma[i], mMEps), _mm_cmpgt_ps(rays.dir.ma[i], mEps)),
-		_mm_and_ps(_mm_cmpge_ps(rays.o.ma[i], mL), _mm_cmple_ps(rays.o.ma[i], mH))
-		));
-		if (!_mm_movemask_ps(mask))
-			return mask;
+	mfloat4 origin = rays.o.ma[0];
+	mfloat4 invdir = rays.invdir.ma[0];
+	mfloat4 t1 = mmul(msub(mset1(L[0]), origin), invdir);
+	mfloat4 t2 = mmul(msub(mset1(H[0]), origin), invdir);
+	mfloat4 tmin = mmin(t1, t2);
+	mfloat4 tmax = mmax(t1, t2);
 
-		/* compute the intersection distance of the planes */
-		t1 = _mm_div_ps(_mm_sub_ps(mL, rays.o.ma[i]), rays.dir.ma[i]);
-		t2 = _mm_div_ps(_mm_sub_ps(mH, rays.o.ma[i]), rays.dir.ma[i]);
-
-		__m128 t = _mm_min_ps(t1, t2);
-		t2 = _mm_max_ps(t1, t2);
-		t1 = t;
+	origin = rays.o.ma[1];
+	invdir = rays.invdir.ma[1];
+	t1 = mmul(msub(mset1(L[1]), origin), invdir);
+	t2 = mmul(msub(mset1(H[1]), origin), invdir);
+	tmin = mmax(mmin(t1, t2), tmin);
+	tmax = mmin(mmax(t1, t2), tmax);
 
-		tnear = _mm_max_ps(tnear, t1);	/* want largest Tnear */
-		tfar = _mm_min_ps(tfar, t2);	/* want smallest Tfar */
+	origin = rays.o.ma[2];
+	invdir = rays.invdir.ma[2];
+	t1 = mmul(msub(mset1(L[2]), origin), invdir);
+	t2 = mmul(msub(mset1(H[2]), origin), invdir);
+	tmin = mmax(mmin(t1, t2), tmin);
+	tmax = mmin(mmax(t1, t2), tmax);
 
-		mask = _mm_and_ps(mask,
-			_mm_and_ps(_mm_cmple_ps(tnear, tfar), _mm_cmpge_ps(tfar, mZero)));
-		if (!_mm_movemask_ps(mask))
-			return mask;
-	}
-
-	a = tnear;
-	b = tfar;
-	return mask;
+	a = tmin;
+	b = tmax;
+	return mand(mcmplt(tmin, tmax), mcmpgt(tmax, mZero));
 }
 #endif
--- a/src/shapes.cc	Fri May 02 13:27:47 2008 +0200
+++ b/src/shapes.cc	Mon May 05 15:31:14 2008 +0200
@@ -54,30 +54,29 @@
 	return false;
 }
 
-#ifndef NO_SSE
-__m128 Sphere::intersect_packet(const RayPacket &rays, __m128 &dists)
+#ifndef NO_SIMD
+mfloat4 Sphere::intersect_packet(const RayPacket &rays, mfloat4 &dists) const
 {
 	VectorPacket V = rays.o - VectorPacket(center);
-	register __m128 d = _mm_sub_ps(mZero, dot(V, rays.dir));
-	register __m128 Det = _mm_sub_ps(_mm_mul_ps(d, d),
-		_mm_sub_ps(dot(V,V), _mm_set_ps1(sqr_radius)));
-	register __m128 t1, t2, mask;
+	register mfloat4 d = msub(mZero, dot(V, rays.dir));
+	register mfloat4 Det = msub(mmul(d, d), msub(dot(V,V), mset1(sqr_radius)));
+	register mfloat4 t1, t2, mask;
 
-	mask = _mm_cmpgt_ps(Det, mZero);
-	if (!_mm_movemask_ps(mask))
+	mask = mcmpgt(Det, mZero);
+	if (!mmovemask(mask))
 		return mask;
 
-	Det = _mm_sqrt_ps(Det);
-	t1 = _mm_sub_ps(d, Det);
-	t2 = _mm_add_ps(d, Det);
+	Det = msqrt(Det);
+	t1 = msub(d, Det);
+	t2 = madd(d, Det);
 
-	mask = _mm_and_ps(mask, _mm_cmpgt_ps(t2, mZero));
+	mask = mand(mask, mcmpgt(t2, mZero));
 
-	const __m128 cond1 = _mm_and_ps(_mm_cmpgt_ps(t1, mZero), _mm_cmplt_ps(t1, dists));
-	const __m128 cond2 = _mm_and_ps(_mm_cmple_ps(t1, mZero), _mm_cmplt_ps(t2, dists));
-	const __m128 newdists = _mm_or_ps(_mm_and_ps(cond1, t1), _mm_and_ps(cond2, t2));
-	mask = _mm_and_ps(mask, _mm_or_ps(cond1, cond2));
-	dists = _mm_or_ps(_mm_and_ps(mask, newdists), _mm_andnot_ps(mask, dists));
+	const mfloat4 cond1 = mand(mcmpgt(t1, mZero), mcmplt(t1, dists));
+	const mfloat4 cond2 = mand(mcmple(t1, mZero), mcmplt(t2, dists));
+	const mfloat4 newdists = mor(mand(cond1, t1), mand(cond2, t2));
+	mask = mand(mask, mor(cond1, cond2));
+	dists = mselect(mask, newdists, dists);
 	return mask;
 }
 #endif
@@ -177,45 +176,32 @@
 	return false;
 }
 
-#ifndef NO_SSE
-__m128 Box::intersect_packet(const RayPacket &rays, __m128 &dists)
+#ifndef NO_SIMD
+mfloat4 Box::intersect_packet(const RayPacket &rays, mfloat4 &dists) const
 {
-	register __m128 tnear = mZero;
-	register __m128 tfar = mInf;
-	register __m128 t1, t2;
-	register __m128 mask = mAllSet;
-
-	for (int i = 0; i < 3; i++)
-	{
-		const __m128 mL = _mm_set_ps1(L[i]);
-		const __m128 mH = _mm_set_ps1(H[i]);
-		mask = _mm_and_ps(mask,
-		_mm_or_ps(
-		_mm_or_ps(_mm_cmplt_ps(rays.dir.ma[i], mMEps), _mm_cmpgt_ps(rays.dir.ma[i], mEps)),
-		_mm_and_ps(_mm_cmpge_ps(rays.o.ma[i], mL), _mm_cmple_ps(rays.o.ma[i], mH))
-		));
-		if (!_mm_movemask_ps(mask))
-			return mask;
+	mfloat4 origin = rays.o.ma[0];
+	mfloat4 invdir = rays.invdir.ma[0];
+	mfloat4 t1 = mmul(msub(mset1(L[0]), origin), invdir);
+	mfloat4 t2 = mmul(msub(mset1(H[0]), origin), invdir);
+	mfloat4 tmin = mmin(t1, t2);
+	mfloat4 tmax = mmax(t1, t2);
 
-		/* compute the intersection distance of the planes */
-		t1 = _mm_div_ps(_mm_sub_ps(mL, rays.o.ma[i]), rays.dir.ma[i]);
-		t2 = _mm_div_ps(_mm_sub_ps(mH, rays.o.ma[i]), rays.dir.ma[i]);
-
-		__m128 t = _mm_min_ps(t1, t2);
-		t2 = _mm_max_ps(t1, t2);
-		t1 = t;
+	origin = rays.o.ma[1];
+	invdir = rays.invdir.ma[1];
+	t1 = mmul(msub(mset1(L[1]), origin), invdir);
+	t2 = mmul(msub(mset1(H[1]), origin), invdir);
+	tmin = mmax(mmin(t1, t2), tmin);
+	tmax = mmin(mmax(t1, t2), tmax);
 
-		tnear = _mm_max_ps(tnear, t1);	/* want largest Tnear */
-		tfar = _mm_min_ps(tfar, t2);	/* want smallest Tfar */
+	origin = rays.o.ma[2];
+	invdir = rays.invdir.ma[2];
+	t1 = mmul(msub(mset1(L[2]), origin), invdir);
+	t2 = mmul(msub(mset1(H[2]), origin), invdir);
+	tmin = mmax(mmin(t1, t2), tmin);
+	tmax = mmin(mmax(t1, t2), tmax);
 
-		mask = _mm_and_ps(mask,
-			_mm_and_ps(_mm_cmple_ps(tnear, tfar), _mm_cmpge_ps(tfar, mZero)));
-		if (!_mm_movemask_ps(mask))
-			return mask;
-	}
-
-	mask = _mm_and_ps(mask, _mm_cmplt_ps(tnear, dists));
-	dists = _mm_or_ps(_mm_and_ps(mask, tnear), _mm_andnot_ps(mask, dists));
+	mfloat4 mask = mand(mand(mcmplt(tmin, tmax), mcmpgt(tmax, mZero)), mcmplt(tmin, dists));
+	dists = mselect(mask, tmin, dists);
 	return mask;
 }
 #endif
@@ -419,48 +405,48 @@
 #endif
 }
 
-#if not defined(NO_SSE) and defined(TRI_BARI_PRE)
-__m128 Triangle::intersect_packet(const RayPacket &rays, __m128 &dists)
+#if !defined(NO_SIMD) && defined(TRI_BARI_PRE)
+mfloat4 Triangle::intersect_packet(const RayPacket &rays, mfloat4 &dists) const
 {
 	static const int modulo3[5] = {0,1,2,0,1};
 	register const int u = modulo3[k+1];
 	register const int v = modulo3[k+2];
-	__m128 mask;
+	mfloat4 mask;
 
-	const __m128 t = _mm_div_ps(
-		_mm_sub_ps(_mm_sub_ps(
-		_mm_sub_ps(_mm_set_ps1(nd), rays.o.ma[k]),
-		_mm_mul_ps(_mm_set_ps1(nu), rays.o.ma[u])
-		), _mm_mul_ps(_mm_set_ps1(nv), rays.o.ma[v])),
-		_mm_add_ps(rays.dir.ma[k],
-		_mm_add_ps(_mm_mul_ps(_mm_set_ps1(nu), rays.dir.ma[u]),
-		_mm_mul_ps(_mm_set_ps1(nv), rays.dir.ma[v])))
+	const mfloat4 t = mdiv(
+		msub(msub(
+		msub(mset1(nd), rays.o.ma[k]),
+		mmul(mset1(nu), rays.o.ma[u])
+		), mmul(mset1(nv), rays.o.ma[v])),
+		madd(rays.dir.ma[k],
+		madd(mmul(mset1(nu), rays.dir.ma[u]),
+		mmul(mset1(nv), rays.dir.ma[v])))
 		);
 
-	mask = _mm_and_ps(_mm_cmplt_ps(t, dists), _mm_cmpge_ps(t, mEps));
-	if (!_mm_movemask_ps(mask))
+	mask = mand(mcmplt(t, dists), mcmpge(t, mEps));
+	if (!mmovemask(mask))
 		return mask;
 
-	const __m128 hu = _mm_sub_ps(_mm_add_ps(rays.o.ma[u],
-		_mm_mul_ps(t, rays.dir.ma[u])), _mm_set_ps1(A->P[u]));
-	const __m128 hv = _mm_sub_ps(_mm_add_ps(rays.o.ma[v],
-		_mm_mul_ps(t, rays.dir.ma[v])), _mm_set_ps1(A->P[v]));
-	const __m128 beta = _mm_add_ps(_mm_mul_ps(hv, _mm_set_ps1(bnu)),
-		_mm_mul_ps(hu, _mm_set_ps1(bnv)));
+	const mfloat4 hu = msub(madd(rays.o.ma[u],
+		mmul(t, rays.dir.ma[u])), mset1(A->P[u]));
+	const mfloat4 hv = msub(madd(rays.o.ma[v],
+		mmul(t, rays.dir.ma[v])), mset1(A->P[v]));
+	const mfloat4 beta = madd(mmul(hv, mset1(bnu)),
+		mmul(hu, mset1(bnv)));
 
-	mask = _mm_and_ps(mask, _mm_cmpge_ps(beta, mZero));
-	if (!_mm_movemask_ps(mask))
+	mask = mand(mask, mcmpge(beta, mZero));
+	if (!mmovemask(mask))
 		return mask;
 
-	const __m128 gamma = _mm_add_ps(_mm_mul_ps(hu, _mm_set_ps1(cnv)),
-		_mm_mul_ps(hv, _mm_set_ps1(cnu)));
+	const mfloat4 gamma = madd(mmul(hu, mset1(cnv)),
+		mmul(hv, mset1(cnu)));
 
-	mask = _mm_and_ps(mask, _mm_and_ps(_mm_cmpge_ps(gamma, mZero),
-		_mm_cmple_ps(_mm_add_ps(beta, gamma), mOne)));
-	if (!_mm_movemask_ps(mask))
+	mask = mand(mask, mand(mcmpge(gamma, mZero),
+		mcmple(madd(beta, gamma), mOne)));
+	if (!mmovemask(mask))
 		return mask;
 
-	dists = _mm_or_ps(_mm_andnot_ps(mask, dists), _mm_and_ps(mask, t));
+	dists = mselect(mask, t, dists);
 	return mask;
 }
 #endif
@@ -580,13 +566,13 @@
 		v=v0[q];
 		if(N[q]>0.0f)
 		{
-			vmin.cell[q]=-boxhalfsize[q] - v;
-			vmax.cell[q]= boxhalfsize[q] - v;
+			vmin[q]=-boxhalfsize[q] - v;
+			vmax[q]= boxhalfsize[q] - v;
 		}
 		else
 		{
-			vmin.cell[q]= boxhalfsize[q] - v;
-			vmax.cell[q]=-boxhalfsize[q] - v;
+			vmin[q]= boxhalfsize[q] - v;
+			vmax[q]=-boxhalfsize[q] - v;
 		}
 	}
 	if(dot(N,vmin)>0.0f) return false;