packetize Phong shader pyrit
authorRadek Brich <radek.brich@devl.cz>
Fri, 02 May 2008 13:27:47 +0200
branchpyrit
changeset 91 9d66d323c354
parent 90 f6a72eb99631
child 92 9af5c039b678
packetize Phong shader new scons config options: simd=(yes|no) - allow/suppress explicit SSE force_flags=(yes|no) - force use of specified flags instead of autodetected profile=(yes|no) - enable gcc's profiling (-pg option) check for pthread.h header, don't try to build without it add fourth Vector3 component for better memory aligning rename Vector3 to Vector partialy SSE-ize Vector class (only fully vertical operations) build static lib and python module in distinctive directories to avoid collision of library file names on some platforms
SConstruct
TODO
ccdemos/SConscript
ccdemos/common_ply.h
ccdemos/realtime.cc
ccdemos/realtime_bunny.cc
ccdemos/realtime_dragon.cc
ccdemos/spheres_shadow.cc
ccdemos/textures.cc
include/common.h
include/container.h
include/kdtree.h
include/material.h
include/matrix.h
include/quaternion.h
include/raytracer.h
include/scene.h
include/serialize.h
include/shapes.h
include/vector.h
src/SConscript
src/container.cc
src/kdtree.cc
src/octree.cc
src/raytracer.cc
src/raytracermodule.cc
src/scene.cc
src/serialize.cc
src/shapes.cc
tests/vector.cc
--- a/SConstruct	Tue Apr 29 23:31:08 2008 +0200
+++ b/SConstruct	Fri May 02 13:27:47 2008 +0200
@@ -30,14 +30,17 @@
 """)
 
 import os, sys
-env = Environment(ENV = {'PATH' : os.environ['PATH']})
+env = Environment() #(ENV = {'PATH' : os.environ['PATH']})
 Decider('MD5-timestamp')
 
 opt = Options(['.optioncache'])
 opt.AddOptions(
 	BoolOption('intelc', 'use Intel C++ Compiler, if available', True),
+	BoolOption('simd', 'allow SSE intrinsics', True),
 	('precision', 'floating point number precision (single/double)', "single"),
 	('flags', 'add additional compiler flags', ""),
+	BoolOption('force_flags', "use only flags specified by 'flags' option (do not autodetect arch/sse flags)", False),
+	BoolOption('profile', "enable gcc's profiling support (-pg)", False),
 )
 if env['PLATFORM'] == 'win32':
 	opt.AddOptions(
@@ -120,12 +123,18 @@
 if cc == 'intelc':
 	add_flags += cpuflags_intelc + ' '
 
+if conf.env['force_flags']:
+	add_flags = conf.env['flags'] + ' '
+else:
+	add_flags += conf.env['flags'] + ' '
+
 if conf.env['precision'] == 'double':
 	add_flags += '-DPYRIT_DOUBLE '
 elif cc == 'gcc':
 	add_flags += '-fsingle-precision-constant '
 
-add_flags += conf.env['flags']
+if not conf.env['simd']:
+	add_flags += '-DNO_SSE '
 
 if cc == 'intelc':
 	conf.env.Append(CCFLAGS="-O3 -w1 " + add_flags)
@@ -143,17 +152,25 @@
 	conf.env.Append(CCFLAGS='-DHAVE_PNG')
 	conf.env.Append(LIBS=['png'])
 
+if not conf.CheckCHeader('pthread.h'):
+	print 'Error: Cannot build without pthread.'
+	Exit(1)
+
 if conf.env['PLATFORM'] == 'win32':
 	conf.env.Append(LIBS=["pthreadGC2"])
 else:
 	conf.env.Append(CCFLAGS="-pthread ")
 
+if conf.env['profile'] and cc == 'gcc':
+	conf.env.Append(CCFLAGS="-pg", LINKFLAGS="-pg")
+
 env = conf.Finish()
 
 
 ### build targets
 
-(lib, pymodule) = SConscript('src/SConscript', build_dir='build/lib', duplicate=0, exports='env')
+lib = SConscript('src/SConscript', build_dir='build/lib', duplicate=0, exports={'env':env,'buildmodule':False})
+pymodule = SConscript('src/SConscript', build_dir='build/pymodule', duplicate=0, exports={'env':env,'buildmodule':True})
 
 SConscript('ccdemos/SConscript', build_dir='build/ccdemos', duplicate=0, exports='env lib')
 SConscript('demos/SConscript', exports='pymodule')
@@ -163,6 +180,8 @@
 
 SConscript('models/SConscript')
 
+env.Alias('libs', ['static-lib', 'python-module'])
+
 env.Alias('docs', Command('docs/html', [], 'doxygen'))
 env.Clean('docs', ['docs/html'])
 
--- a/TODO	Tue Apr 29 23:31:08 2008 +0200
+++ b/TODO	Fri May 02 13:27:47 2008 +0200
@@ -4,9 +4,11 @@
 
 Future Plans
 ============
+ * Python binding for material.h classes
+ * Container should implement Shape interface to allow building hierarchies
+ * Container should allow local space transformations
  * generalization: Camera "shader" (ray generator), surface shader and maybe light & background shaders
- * namespace
- * Python binding for all classes
+ * put everything into a namespace
  * stochastic oversampling
  * absorbtion of refracted rays in dense materials (can be computed using shape distance and some 'absorbance' constant)
  * implement efficient AABB-ray intersection using Plucker coordinates
--- a/ccdemos/SConscript	Tue Apr 29 23:31:08 2008 +0200
+++ b/ccdemos/SConscript	Fri May 02 13:27:47 2008 +0200
@@ -4,7 +4,12 @@
 myenv.Prepend(LIBS=['pyrit'])
 
 sdlenv = myenv.Clone()
-sdlenv.ParseConfig('sh sdl-config --cflags --libs')
+try:
+	sdlenv.ParseConfig('sh sdl-config --cflags --libs')
+except:
+	print "SDL not found, some demos will not built."
+	myenv.Alias('cc-demos', None)
+	Return()
 
 l = []
 l.append( sdlenv.Program(['realtime.cc']) )
--- a/ccdemos/common_ply.h	Tue Apr 29 23:31:08 2008 +0200
+++ b/ccdemos/common_ply.h	Fri May 02 13:27:47 2008 +0200
@@ -2,10 +2,10 @@
 #include <fstream>
 #include <iomanip>
 
-void load_ply(Raytracer &rt, const char *filename, Material *mat, Vector3 scale, Vector3 transp)
+void load_ply(Raytracer &rt, const char *filename, Material *mat, Vector scale, Vector transp)
 {
 	vector<NormalVertex*> vertices;
-	vector<Vector3> normals;
+	vector<Vector> normals;
 	vector<int> vertex_face_num;
 	ifstream f(filename);
 	string token = "a";
@@ -31,7 +31,7 @@
 	}
 
 	// read vertices
-	Vector3 P;
+	Vector P;
 	int num = vertex_num;
 	while (num--)
 	{
@@ -40,7 +40,7 @@
 		P.y = scale.y*P.y + transp.y;
 		P.z = scale.z*P.z + transp.z;
 		vertices.push_back(new NormalVertex(P));
-		normals.push_back(Vector3());
+		normals.push_back(Vector());
 		vertex_face_num.push_back(0);
 		f.ignore(1000,'\n');
 	}
--- a/ccdemos/realtime.cc	Tue Apr 29 23:31:08 2008 +0200
+++ b/ccdemos/realtime.cc	Fri May 02 13:27:47 2008 +0200
@@ -14,21 +14,21 @@
 	rt.setMaxDepth(3);
 	rt.setTop(&top);
 
-	Light light1(Vector3(2.0, -5.0, -5.0), Colour(0.7, 0.3, 0.6));
+	Light light1(Vector(2.0, -5.0, -5.0), Colour(0.7, 0.3, 0.6));
 	light1.castShadows(false);
 	rt.addLight(&light1);
 
-	Light light2(Vector3(-2.0, 10.0, 2.0), Colour(0.4, 0.6, 0.3));
+	Light light2(Vector(-2.0, 10.0, 2.0), Colour(0.4, 0.6, 0.3));
 	light2.castShadows(false);
 	rt.addLight(&light2);
 
 	Material mat_sph(Colour(1.0, 1.0, 1.0));
 	for (int y=0; y<10; y++)
 		for (int x=0; x<10; x++)
-			rt.addShape(new Sphere(Vector3(x*2-10, (Float)rand()/RAND_MAX*5.0, y*2-10), 0.45, &mat_sph));
+			rt.addShape(new Sphere(Vector(x*2-10, (Float)rand()/RAND_MAX*5.0, y*2-10), 0.45, &mat_sph));
 
 	rt.setCamera(&cam);
-	cam.setEye(Vector3(0,0,10));
+	cam.setEye(Vector(0,0,10));
 
 	top.optimize();
 
--- a/ccdemos/realtime_bunny.cc	Tue Apr 29 23:31:08 2008 +0200
+++ b/ccdemos/realtime_bunny.cc	Fri May 02 13:27:47 2008 +0200
@@ -13,20 +13,20 @@
 	rt.setMaxDepth(0);
 	rt.setTop(&top);
 
-	Light light1(Vector3(-5.0, 2.0, 8.0), Colour(0.9, 0.3, 0.6));
+	Light light1(Vector(-5.0, 2.0, 8.0), Colour(0.9, 0.3, 0.6));
 	light1.castShadows(false);
 	rt.addLight(&light1);
 
-	//Light light2(Vector3(-2.0, 10.0, 2.0), Colour(0.4, 0.6, 0.3));
+	//Light light2(Vector(-2.0, 10.0, 2.0), Colour(0.4, 0.6, 0.3));
 	//light2.castShadows(false);
 	//rt.addLight(&light2);
 
 	Material mat(Colour(0.9, 0.9, 0.9));
 	mat.setSmooth(true);
-	load_ply(rt, "../models/ply/bunny/bun_zipper.ply", &mat, Vector3(-29,29,29), Vector3(-1,-3,0));
+	load_ply(rt, "../models/ply/bunny/bun_zipper.ply", &mat, Vector(-29,29,29), Vector(-1,-3,0));
 
 	rt.setCamera(&cam);
-	cam.setEye(Vector3(0,0,10));
+	cam.setEye(Vector(0,0,10));
 
 	top.optimize();
 
--- a/ccdemos/realtime_dragon.cc	Tue Apr 29 23:31:08 2008 +0200
+++ b/ccdemos/realtime_dragon.cc	Fri May 02 13:27:47 2008 +0200
@@ -14,13 +14,13 @@
 	rt.setMaxDepth(0);
 	rt.setTop(&top);
 	rt.setCamera(&cam);
-	cam.setEye(Vector3(0,0,10));
+	cam.setEye(Vector(0,0,10));
 
-	Light light1(Vector3(-5.0, 2.0, 8.0), Colour(0.9, 0.3, 0.6));
+	Light light1(Vector(-5.0, 2.0, 8.0), Colour(0.9, 0.3, 0.6));
 	light1.castShadows(false);
 	rt.addLight(&light1);
 
-	//Light light2(Vector3(-2.0, 10.0, 2.0), Colour(0.4, 0.6, 0.3));
+	//Light light2(Vector(-2.0, 10.0, 2.0), Colour(0.4, 0.6, 0.3));
 	//light2.castShadows(false);
 	//rt.addlight(&light2);
 
@@ -34,7 +34,7 @@
 	if (!fin)
 	{
 		load_ply(rt, "../models/ply/dragon/dragon_vrip.ply",
-			&mat, Vector3(-29,29,-29), Vector3(0,-3.6,0));
+			&mat, Vector(-29,29,-29), Vector(0,-3.6,0));
 		top.optimize();
 	}
 	else
--- a/ccdemos/spheres_shadow.cc	Tue Apr 29 23:31:08 2008 +0200
+++ b/ccdemos/spheres_shadow.cc	Fri May 02 13:27:47 2008 +0200
@@ -4,7 +4,7 @@
 #include "common_sdl.h"
 
 Camera cam;
-Light light(Vector3(-2.0, 10.0, -2.0), Colour(0.9, 0.9, 0.9));
+Light light(Vector(-2.0, 10.0, -2.0), Colour(0.9, 0.9, 0.9));
 
 Float lx, ly, lz, cf;
 
@@ -67,34 +67,34 @@
 
 	Material mat0a(Colour(0.7, 0.7, 0.7));
 	mat0a. setReflectivity(0.0);
-	Box box(Vector3(-10.0, -1.2, -20.0), Vector3(10.0, -1.0, 0.0), &mat0a);
+	Box box(Vector(-10.0, -1.2, -20.0), Vector(10.0, -1.0, 0.0), &mat0a);
 	rt.addShape(&box);
 
 	Material mat0b(Colour(0.1, 0.7, 0.8));
 	mat0b.setReflectivity(0.7);
-	Box box2(Vector3(-10.0, -1.2, -20.0), Vector3(10.0, 10.0, -20.2), &mat0b);
+	Box box2(Vector(-10.0, -1.2, -20.0), Vector(10.0, 10.0, -20.2), &mat0b);
 	rt.addShape(&box2);
 
 	Material mat1(Colour(1.0, 0.0, 0.0));
-	Sphere bigsphere(Vector3(3.0, 2.0, -7.0), 3.0, &mat1);
+	Sphere bigsphere(Vector(3.0, 2.0, -7.0), 3.0, &mat1);
 	rt.addShape(&bigsphere);
 
 	Material mat2(Colour(0.0, 1.0, 0.0));
-	Sphere smallsphere(Vector3(-5.5, 1.5, -8.0), 2.0, &mat2);
+	Sphere smallsphere(Vector(-5.5, 1.5, -8.0), 2.0, &mat2);
 	rt.addShape(&smallsphere);
 
 	Material mat3(Colour(0.0, 0.0, 1.0));
 	mat3.setReflectivity(0.1);
 	mat3.setTransmissivity(0.8, 1.5);
-	Sphere tinysphere(Vector3(-1.2, 0.0, -2.0), 0.7, &mat3);
+	Sphere tinysphere(Vector(-1.2, 0.0, -2.0), 0.7, &mat3);
 	rt.addShape(&tinysphere);
 
 	top.optimize();
 
-	cam.setEye(Vector3(-2.28908, 4.30992, 12.3051));
-	cam.p = Vector3(0.0988566, -0.139543, -0.985269);
-	cam.u = Vector3(-0.995004, 0, -0.0998334);
-	cam.v = Vector3(0.0139311, 0.990216, -0.138846);
+	cam.setEye(Vector(-2.28908, 4.30992, 12.3051));
+	cam.p = Vector(0.0988566, -0.139543, -0.985269);
+	cam.u = Vector(-0.995004, 0, -0.0998334);
+	cam.v = Vector(0.0139311, 0.990216, -0.138846);
 	rt.setCamera(&cam);
 
 	w = 800;
--- a/ccdemos/textures.cc	Tue Apr 29 23:31:08 2008 +0200
+++ b/ccdemos/textures.cc	Fri May 02 13:27:47 2008 +0200
@@ -1,10 +1,10 @@
 #include "raytracer.h"
-#include "octree.h"
+#include "kdtree.h"
 
 #include "common_sdl.h"
 
-Camera cam(Vector3(0.,6.,6.), Vector3(0.,2.,-7.), Vector3(0.,0.,-1.));
-Light light(Vector3(-2.0, 10.0, -2.0), Colour(0.9, 0.9, 0.9));
+Camera cam(Vector(0.,6.,6.), Vector(0.,2.,-7.), Vector(0.,0.,-1.));
+Light light(Vector(-2.0, 10.0, -2.0), Colour(0.9, 0.9, 0.9));
 
 Float lx, ly, lz, cf;
 
@@ -56,7 +56,7 @@
 {
 	Raytracer rt;
 
-	Octree top;
+	KdTree top;
 	rt.setCamera(&cam);
 	rt.setTop(&top);
 
@@ -65,12 +65,12 @@
 
 	Material mat0a(Colour(0.7, 0.7, 0.7));
 	mat0a.setReflectivity(0.0);
-	Box box(Vector3(-12.0, -1.2, -20.0), Vector3(12.0, -1.0, 0.0), &mat0a);
+	Box box(Vector(-12.0, -1.2, -20.0), Vector(12.0, -1.0, 0.0), &mat0a);
 	rt.addShape(&box);
 
 	Material mat0b(Colour(0.1, 0.7, 0.8));
 	mat0b.setReflectivity(0.7);
-	Box box2(Vector3(-12.0, -1.2, -10.0), Vector3(12.0, 10.0, -10.2), &mat0b);
+	Box box2(Vector(-12.0, -1.2, -10.0), Vector(12.0, 10.0, -10.2), &mat0b);
 	rt.addShape(&box2);
 
 	Float bounds[] = {0.3, 0.6, 1.1};
@@ -79,44 +79,44 @@
 
 	// spheres
 	Material mat1(Colour(1.0, 1.0, 1.0));
-	mat1.texture = new CheckersTexture(new PlanarMap(Vector3(-4.5, 2.0, -7.0), 0.48), &cmap);
-	Sphere sphere1(Vector3(-4.5, 2.0, -7.0), 1.0, &mat1);
+	mat1.texture = new CheckersTexture(new PlanarMap(Vector(-4.5, 2.0, -7.0), 0.48), &cmap);
+	Sphere sphere1(Vector(-4.5, 2.0, -7.0), 1.0, &mat1);
 	rt.addShape(&sphere1);
 
 	Material mat2(Colour(1.0, 1.0, 1.0));
-	mat2.texture = new CheckersTexture(new CubicMap(Vector3(-1.5, 2.0, -7.0), 0.48), &cmap);
-	Sphere sphere2(Vector3(-1.5, 2.0, -7.0), 1.0, &mat2);
+	mat2.texture = new CheckersTexture(new CubicMap(Vector(-1.5, 2.0, -7.0), 0.48), &cmap);
+	Sphere sphere2(Vector(-1.5, 2.0, -7.0), 1.0, &mat2);
 	rt.addShape(&sphere2);
 
 	Material mat3(Colour(1.0, 1.0, 1.0));
-	mat3.texture = new CheckersTexture(new CylinderMap(Vector3(1.5, 2.0, -7.0), 0.48), &cmap);
-	Sphere sphere3(Vector3(1.5, 2.0, -7.0), 1.0, &mat3);
+	mat3.texture = new CheckersTexture(new CylinderMap(Vector(1.5, 2.0, -7.0), 0.48), &cmap);
+	Sphere sphere3(Vector(1.5, 2.0, -7.0), 1.0, &mat3);
 	rt.addShape(&sphere3);
 
 	Material mat4(Colour(1.0, 1.0, 1.0));
-	mat4.texture = new CheckersTexture(new SphereMap(Vector3(4.5, 2.0, -7.0), 0.48), &cmap);
-	Sphere sphere4(Vector3(4.5, 2.0, -7.0), 1.0, &mat4);
+	mat4.texture = new CheckersTexture(new SphereMap(Vector(4.5, 2.0, -7.0), 0.48), &cmap);
+	Sphere sphere4(Vector(4.5, 2.0, -7.0), 1.0, &mat4);
 	rt.addShape(&sphere4);
 
 	// cubes
 	Material mat5(Colour(1.0, 1.0, 1.0));
-	mat5.texture = new CheckersTexture(new PlanarMap(Vector3(-4.5, 0.0, -7.0), 0.48), &cmap);
-	Box cube1(Vector3(-4.5, 0.0, -7.0)-1.0, Vector3(-4.5, 0.0, -7.0)+1.0, &mat5);
+	mat5.texture = new CheckersTexture(new PlanarMap(Vector(-4.5, 0.0, -7.0), 0.48), &cmap);
+	Box cube1(Vector(-4.5, 0.0, -7.0)-1.0, Vector(-4.5, 0.0, -7.0)+1.0, &mat5);
 	rt.addShape(&cube1);
 
 	Material mat6(Colour(1.0, 1.0, 1.0));
-	mat6.texture = new CheckersTexture(new CubicMap(Vector3(-1.5, 0.0, -7.0), 0.48), &cmap);
-	Box cube2(Vector3(-1.5, 0.0, -7.0)-1.0, Vector3(-1.5, 0.0, -7.0)+1.0, &mat6);
+	mat6.texture = new CheckersTexture(new CubicMap(Vector(-1.5, 0.0, -7.0), 0.48), &cmap);
+	Box cube2(Vector(-1.5, 0.0, -7.0)-1.0, Vector(-1.5, 0.0, -7.0)+1.0, &mat6);
 	rt.addShape(&cube2);
 
 	Material mat7(Colour(1.0, 1.0, 1.0));
-	mat7.texture = new CheckersTexture(new CylinderMap(Vector3(1.5, 0.0, -7.0), 0.48), &cmap);
-	Box cube3(Vector3(1.5, 0.0, -7.0)-1.0, Vector3(1.5, 0.0, -7.0)+1.0, &mat7);
+	mat7.texture = new CheckersTexture(new CylinderMap(Vector(1.5, 0.0, -7.0), 0.48), &cmap);
+	Box cube3(Vector(1.5, 0.0, -7.0)-1.0, Vector(1.5, 0.0, -7.0)+1.0, &mat7);
 	rt.addShape(&cube3);
 
 	Material mat8(Colour(1.0, 1.0, 1.0));
-	mat8.texture = new CheckersTexture(new SphereMap(Vector3(4.5, 0.0, -7.0), 0.48), &cmap);
-	Box cube4(Vector3(4.5, 0.0, -7.0)-1.0, Vector3(4.5, 0.0, -7.0)+1.0, &mat8);
+	mat8.texture = new CheckersTexture(new SphereMap(Vector(4.5, 0.0, -7.0), 0.48), &cmap);
+	Box cube4(Vector(4.5, 0.0, -7.0)-1.0, Vector(4.5, 0.0, -7.0)+1.0, &mat8);
 	rt.addShape(&cube4);
 
 	mat1.setReflectivity(0);
--- a/include/common.h	Tue Apr 29 23:31:08 2008 +0200
+++ b/include/common.h	Fri May 02 13:27:47 2008 +0200
@@ -33,9 +33,6 @@
 #include <pthread.h>
 #include <string>
 
-// SSE intrinsics
-#include <xmmintrin.h>
-
 using namespace std;
 
 #ifdef PYRIT_DOUBLE
@@ -48,13 +45,27 @@
 # define Inf FLT_MAX
 #endif
 
+#ifndef NO_SSE
+// SSE intrinsics
+#include <xmmintrin.h>
+
 const __m128 mZero = _mm_set_ps1(0.);
 const __m128 mOne = _mm_set_ps1(1.);
+const __m128 mTwo = _mm_set_ps1(2.);
 const __m128 mEps = _mm_set_ps1(Eps);
 const __m128 mMEps = _mm_set_ps1(-Eps);
 const __m128 mInf = _mm_set_ps1(Inf);
 const __m128 mAllSet = _mm_cmplt_ps(mZero, mOne);
 
+inline const __m128 _mm_fastpow(const __m128& base, const __m128& exponent)
+{
+    __m128 denom = _mm_mul_ps( exponent, base);
+    denom = _mm_sub_ps( exponent, denom);
+    denom = _mm_add_ps( base, denom);
+    return _mm_mul_ps( base, _mm_rcp_ps(denom));
+}
+#endif
+
 /* verbosity level:
 0: only errors (E)
 1: major status messages (*)
--- a/include/container.h	Tue Apr 29 23:31:08 2008 +0200
+++ b/include/container.h	Fri May 02 13:27:47 2008 +0200
@@ -3,7 +3,7 @@
  *
  * This file is part of Pyrit Ray Tracer.
  *
- * Copyright 2007  Radek Brich
+ * Copyright 2007, 2008  Radek Brich
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -48,14 +48,17 @@
 	//void addShapeNoExtend(Shape* aShape) { shapes.push_back(aShape); };
 	virtual Shape *nearest_intersection(const Shape *origin_shape, const Ray &ray,
 		Float &nearest_distance);
-	virtual void packet_intersection(const Shape **origin_shapes, const RayPacket &rays,
-		Float *nearest_distances, Shape **nearest_shapes);
 
 	virtual void optimize() {};
 
 	ShapeList & getShapes() { return shapes; };
 
 	virtual ostream & dump(ostream &st);
+
+#ifndef NO_SSE
+	virtual void packet_intersection(const Shape **origin_shapes, const RayPacket &rays,
+		Float *nearest_distances, Shape **nearest_shapes);
+#endif
 };
 
 #endif
--- a/include/kdtree.h	Tue Apr 29 23:31:08 2008 +0200
+++ b/include/kdtree.h	Fri May 02 13:27:47 2008 +0200
@@ -86,8 +86,10 @@
 	void addShape(Shape* aShape) { Container::addShape(aShape); built = false; };
 	Shape *nearest_intersection(const Shape *origin_shape, const Ray &ray,
 		Float &nearest_distance);
+#ifndef NO_SSE
 	void packet_intersection(const Shape **origin_shapes, const RayPacket &rays,
 		Float *nearest_distances, Shape **nearest_shapes);
+#endif
 	void optimize() { build(); };
 	void build();
 	bool isBuilt() const { return built; };
--- a/include/material.h	Tue Apr 29 23:31:08 2008 +0200
+++ b/include/material.h	Fri May 02 13:27:47 2008 +0200
@@ -43,7 +43,7 @@
 {
 public:
 	virtual ~Texture() {};
-	virtual Colour evaluate(const Vector3 &point) = 0;
+	virtual Colour evaluate(const Vector &point) = 0;
 };
 
 /**
@@ -103,13 +103,13 @@
 class TextureMap
 {
 protected:
-	Vector3 center;
+	Vector center;
 	Float invsize;
 public:
-	TextureMap(const Vector3 &acenter, const Float &size):
+	TextureMap(const Vector &acenter, const Float &size):
 		center(acenter), invsize(1./size) {};
 	virtual ~TextureMap() {};
-	virtual void map(const Vector3 &point, Float &u, Float &v) = 0;
+	virtual void map(const Vector &point, Float &u, Float &v) = 0;
 };
 
 /**
@@ -118,11 +118,11 @@
 class PlanarMap: public TextureMap
 {
 public:
-	PlanarMap(const Vector3 &acenter, const Float &size):
+	PlanarMap(const Vector &acenter, const Float &size):
 		TextureMap(acenter, size) {};
-	void map(const Vector3 &point, Float &u, Float &v)
+	void map(const Vector &point, Float &u, Float &v)
 	{
-		const Vector3 p = point - center;
+		const Vector p = point - center;
 		u = p.x*invsize;
 		v = p.y*invsize;
 	};
@@ -134,11 +134,11 @@
 class CubicMap: public TextureMap
 {
 public:
-	CubicMap(const Vector3 &acenter, const Float &size):
+	CubicMap(const Vector &acenter, const Float &size):
 		TextureMap(acenter, size) {};
-	void map(const Vector3 &point, Float &u, Float &v)
+	void map(const Vector &point, Float &u, Float &v)
 	{
-		const Vector3 p = point - center;
+		const Vector p = point - center;
 		if (fabs(p.x) > fabs(p.y))
 		{
 			if (fabs(p.x) > fabs(p.z))
@@ -188,11 +188,11 @@
 class CylinderMap: public TextureMap
 {
 public:
-	CylinderMap(const Vector3 &acenter, const Float &size):
+	CylinderMap(const Vector &acenter, const Float &size):
 		TextureMap(acenter, size) {};
-	void map(const Vector3 &point, Float &u, Float &v)
+	void map(const Vector &point, Float &u, Float &v)
 	{
-		const Vector3 p = point - center;
+		const Vector p = point - center;
 		u = ( M_PI + atan2(p.z, p.x) ) * invsize;
 		v = p.y * invsize;
 	};
@@ -204,11 +204,11 @@
 class SphereMap: public TextureMap
 {
 public:
-	SphereMap(const Vector3 &acenter, const Float &size):
+	SphereMap(const Vector &acenter, const Float &size):
 		TextureMap(acenter, size) {};
-	void map(const Vector3 &point, Float &u, Float &v)
+	void map(const Vector &point, Float &u, Float &v)
 	{
-		const Vector3 p = point - center;
+		const Vector p = point - center;
 		u = ( M_PI + atan2(p.z, p.x) ) * invsize;
 		v = acos(p.y / p.mag()) * invsize;
 	};
@@ -234,7 +234,7 @@
 public:
 	ImageTexture(TextureMap *tmap, Pixmap *image):
 		Texture2D(tmap), pixmap(image) {};
-	Colour evaluate(const Vector3 &point)
+	Colour evaluate(const Vector &point)
 	{
 		Float u,v;
 		map->map(point, u,v);
@@ -255,7 +255,7 @@
 public:
 	CheckersTexture(TextureMap *tmap, ColourMap *cmap):
 		Texture2D(tmap), colourmap(cmap) {};
-	Colour evaluate(const Vector3 &point)
+	Colour evaluate(const Vector &point)
 	{
 		Float u,v, val;
 		map->map(point, u,v);
@@ -274,7 +274,7 @@
 public:
 	CloudTexture(const Float &adetail, ColourMap *cmap):
 		detail(adetail), colourmap(cmap) {};
-	Colour evaluate(const Vector3 &point)
+	Colour evaluate(const Vector &point)
 	{
 		Float sum = 0.0;
 		for (int i = 1; i < detail; i++)
--- a/include/matrix.h	Tue Apr 29 23:31:08 2008 +0200
+++ b/include/matrix.h	Fri May 02 13:27:47 2008 +0200
@@ -3,7 +3,7 @@
  *
  * This file is part of Pyrit Ray Tracer.
  *
- * Copyright 2006, 2007  Radek Brich
+ * Copyright 2006, 2007, 2008  Radek Brich
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -98,9 +98,9 @@
 	friend Matrix operator*(const Float &f, Matrix &m) { return m * f; };
 
 	// product of matrix and vector
-	Vector3 operator*(const Vector3 &v)
+	Vector operator*(const Vector &v)
 	{
-		Vector3 u = Vector3();
+		Vector u = Vector();
 		u.x = data[0][0] * v.x + data[0][1] * v.y + data[0][2] * v.z + data[0][3] * v.w;
 		u.y = data[1][0] * v.x + data[1][1] * v.y + data[1][2] * v.z + data[1][3] * v.w;
 		u.z = data[2][0] * v.x + data[2][1] * v.y + data[2][2] * v.z + data[2][3] * v.w;
@@ -108,7 +108,7 @@
 		return u;
 	}
 
-	friend Matrix operator*(const Vector3 &v, Matrix &m) { return m * v; };
+	friend Matrix operator*(const Vector &v, Matrix &m) { return m * v; };
 };
 
 #endif
--- a/include/quaternion.h	Tue Apr 29 23:31:08 2008 +0200
+++ b/include/quaternion.h	Fri May 02 13:27:47 2008 +0200
@@ -3,7 +3,7 @@
  *
  * This file is part of Pyrit Ray Tracer.
  *
- * Copyright 2007  Radek Brich
+ * Copyright 2007, 2008  Radek Brich
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -44,9 +44,9 @@
 	Quaternion(): a(0), b(0), c(0), d(0) {};
 	Quaternion(const Float aa, const Float ab, const Float ac, const Float ad):
 		a(aa), b(ab), c(ac), d(ad) {};
-	Quaternion(const Vector3& v): a(0), b(v.x), c(v.y), d(v.z) {};
+	Quaternion(const Vector& v): a(0), b(v.x), c(v.y), d(v.z) {};
 
-	Vector3 toVector() { return Vector3(b, c, d); };
+	Vector toVector() { return Vector(b, c, d); };
 
 	Quaternion normalize()
 	{
--- a/include/raytracer.h	Tue Apr 29 23:31:08 2008 +0200
+++ b/include/raytracer.h	Fri May 02 13:27:47 2008 +0200
@@ -39,7 +39,7 @@
 struct RenderrowData {
 	Raytracer *rt;
 	int w;
-	Vector3 eye, dfix, dx, dy;
+	Vector eye, dfix, dx, dy;
 	Float *iter;
 };
 
@@ -59,17 +59,22 @@
 	int max_depth;
 	bool use_packets;
 
-	Vector3 SphereDistribute(int i, int n, Float extent, Vector3 &normal);
-
 	Sample *sample_queue;
 	int sample_queue_pos, sample_queue_size, sample_queue_count;
 	bool end_of_samples;
 	pthread_mutex_t sample_queue_mutex, sampler_mutex;
 	pthread_cond_t sample_queue_cond, worker_ready_cond;
 
-	Colour shader_evalulate(const Ray &ray, int depth, Shape *origin_shape,
-		Float nearest_distance, Shape *nearest_shape);
+	Vector SphereDistribute(int i, int n, Float extent, const Vector &normal);
+	Colour PhongShader(const Shape *shape,
+		const Vector &P, const Vector &N, const Vector &V);
+	void lightScatter(const Ray &ray, const Shape *shape, int depth,
+		const Vector &P, const Vector &normal, bool from_inside, Colour &col);
+#ifndef NO_SSE
+	VectorPacket PhongShader_packet(const Shape **shapes,
+		const VectorPacket &P, const VectorPacket &N, const VectorPacket &V);
 	void raytracePacket(RayPacket &rays, Colour *results);
+#endif
 	static void *raytrace_worker(void *d);
 public:
 	Raytracer(): top(NULL), camera(NULL), lights(), bg_colour(0., 0., 0.),
@@ -89,7 +94,7 @@
 	}
 
 	void render();
-	Colour raytrace(Ray &ray, int depth, Shape *origin_shape);
+	Colour raytrace(Ray &ray, int depth, const Shape *origin_shape);
 	void addShape(Shape *shape) { top->addShape(shape); };
 	void addLight(Light *light) { lights.push_back(light); };
 	void setSampler(Sampler *sampl) { sampler = sampl; };
--- a/include/scene.h	Tue Apr 29 23:31:08 2008 +0200
+++ b/include/scene.h	Fri May 02 13:27:47 2008 +0200
@@ -41,12 +41,13 @@
 class Ray
 {
 public:
-	Vector3 o, dir;
+	Vector o, dir;
 	Ray(): o(), dir() {};
-	Ray(const Vector3 &ao, const Vector3 &adir):
+	Ray(const Vector &ao, const Vector &adir):
 		o(ao), dir(adir) {};
 };
 
+#ifndef NO_SSE
 /**
  * packet of 4 rays
  */
@@ -55,14 +56,19 @@
 public:
 	VectorPacket o, dir;
 
+	RayPacket(): o(), dir() {};
+	RayPacket(const VectorPacket &ao, const VectorPacket &adir):
+		o(ao), dir(adir) {};
+
 	// index operator - get a ray
 	Ray operator[](int i) const
 	{
 		return Ray(
-			Vector3(o.x[i], o.y[i], o.z[i]),
-			Vector3(dir.x[i], dir.y[i], dir.z[i]));
+			Vector(o.x[i], o.y[i], o.z[i]),
+			Vector(dir.x[i], dir.y[i], dir.z[i]));
 	};
 };
+#endif
 
 /**
  * a camera
@@ -70,31 +76,32 @@
 class Camera
 {
 public:
-	Vector3 eye, p, u, v;
+	Vector eye, p, u, v;
 	Float F;
 
 	Camera(): eye(0,0,10), p(0,0,-1), u(-1,0,0), v(0,1,0), F(2.*tan(M_PI/8.)) {};
-	Camera(const Vector3 &C, const Vector3 &ap, const Vector3 &au, const Vector3 &av):
+	Camera(const Vector &C, const Vector &ap, const Vector &au, const Vector &av):
 		eye(C), p(ap), u(au), v(av), F(2.*tan(M_PI/8.)) {};
-	Camera(const Vector3 &from, const Vector3 &lookat, const Vector3 &up):
+	Camera(const Vector &from, const Vector &lookat, const Vector &up):
 		eye(from), F(2.*tan(M_PI/8.))
 	{
 		p = lookat - from; u = cross(up, p);
 		p.normalize(); u.normalize();
 		v = cross(p, u);
 	};
-	void setEye(const Vector3 &aeye) { eye = aeye; };
+	void setEye(const Vector &aeye) { eye = aeye; };
 	void setAngle(const Float angle) { F = 2.*tan(angle/2.); };
 	void rotate(const Quaternion &q);
 	void move(const Float fw, const Float left, const Float up);
 
 	Ray makeRay(Sample &samp)
 	{
-		Vector3 dir = p - (u*samp.x + v*samp.y)*F;
+		Vector dir = p - (u*samp.x + v*samp.y)*F;
 		dir.normalize();
 		return Ray(eye, dir);
 	};
 
+#ifndef NO_SSE
 	void makeRayPacket(Sample *samples, RayPacket &rays)
 	{
 		__m128 m1x,m1y,m1z;
@@ -145,6 +152,7 @@
 
 		rays.dir.normalize();
 	};
+#endif
 };
 
 /**
@@ -153,13 +161,13 @@
 class Light
 {
 public:
-	Vector3 pos;
+	Vector pos;
 	Colour colour;
 	bool cast_shadows;
 
 	Light():
-		pos(Vector3(0,0,0)), colour(Colour(1,1,1)), cast_shadows(true) {};
-	Light(const Vector3 &position, const Colour &acolour):
+		pos(Vector(0,0,0)), colour(Colour(1,1,1)), cast_shadows(true) {};
+	Light(const Vector &position, const Colour &acolour):
 		pos(position), colour(acolour), cast_shadows(true) {};
 	void castShadows(bool cast) { cast_shadows = cast; };
 };
@@ -170,15 +178,17 @@
 class BBox
 {
 public:
-	Vector3 L;
-	Vector3 H;
+	Vector L;
+	Vector H;
 	BBox(): L(), H() {};
-	BBox(const Vector3 aL, const Vector3 aH): L(aL), H(aH) {};
+	BBox(const Vector aL, const Vector aH): L(aL), H(aH) {};
 	Float w() { return H.x-L.x; };
 	Float h() { return H.y-L.y; };
 	Float d() { return H.z-L.z; };
 	bool intersect(const Ray &ray, Float &a, Float &b);
+#ifndef NO_SSE
 	__m128 intersect_packet(const RayPacket &rays, __m128 &a, __m128 &b);
+#endif
 };
 
 #endif
--- a/include/serialize.h	Tue Apr 29 23:31:08 2008 +0200
+++ b/include/serialize.h	Fri May 02 13:27:47 2008 +0200
@@ -52,6 +52,6 @@
 ostream & operator<<(ostream &st, Shape &o);
 ostream & operator<<(ostream &st, Vertex &o);
 ostream & operator<<(ostream &st, Container &o);
-istream & operator>>(istream &st, Vector3 &v);
+istream & operator>>(istream &st, Vector &v);
 
 #endif
--- a/include/shapes.h	Tue Apr 29 23:31:08 2008 +0200
+++ b/include/shapes.h	Fri May 02 13:27:47 2008 +0200
@@ -55,6 +55,7 @@
 	// first intersection point
 	virtual bool intersect(const Ray &ray, Float &dist) const = 0;
 
+#ifndef NO_SSE
 	virtual __m128 intersect_packet(const RayPacket &rays, __m128 &dists)
 	{
 		__m128 results;
@@ -64,6 +65,7 @@
 		((int*)&results)[3] = intersect(rays[3], ((float*)&dists)[3]) ? -1 : 0;
 		return results;
 	};
+#endif
 
 	// all intersections (only for CSG)
 	virtual bool intersect_all(const Ray &ray, Float dist, vector<Float> &allts) const = 0;
@@ -72,7 +74,7 @@
 	virtual bool intersect_bbox(const BBox &bbox) const = 0;
 
 	// normal at point P
-	virtual const Vector3 normal(const Vector3 &P) const = 0;
+	virtual const Vector normal(const Vector &P) const = 0;
 
 	virtual BBox get_bbox() const = 0;
 
@@ -89,25 +91,27 @@
  */
 class Sphere: public Shape
 {
-	Vector3 center;
+	Vector center;
 	Float radius;
 
 	Float sqr_radius;
 	Float inv_radius;
 public:
-	Sphere(const Vector3 &acenter, const Float aradius, Material *amaterial):
+	Sphere(const Vector &acenter, const Float aradius, Material *amaterial):
 		center(acenter), radius(aradius),
 		sqr_radius(aradius*aradius), inv_radius(1.0f/aradius)
 		{ material = amaterial; }
 	bool intersect(const Ray &ray, Float &dist) const;
-	__m128 intersect_packet(const RayPacket &rays, __m128 &dists);
 	bool intersect_all(const Ray &ray, Float dist, vector<Float> &allts) const;
 	bool intersect_bbox(const BBox &bbox) const;
-	const Vector3 normal(const Vector3 &P) const { return (P - center) * inv_radius; };
+	const Vector normal(const Vector &P) const { return (P - center) * inv_radius; };
 	BBox get_bbox() const;
-	const Vector3 getCenter() const { return center; };
+	const Vector getCenter() const { return center; };
 	Float getRadius() const { return radius; };
 	ostream & dump(ostream &st) const;
+#ifndef NO_SSE
+	__m128 intersect_packet(const RayPacket &rays, __m128 &dists);
+#endif
 };
 
 /**
@@ -115,10 +119,10 @@
  */
 class Box: public Shape
 {
-	Vector3 L;
-	Vector3 H;
+	Vector L;
+	Vector H;
 public:
-	Box(const Vector3 &aL, const Vector3 &aH, Material *amaterial): L(aL), H(aH)
+	Box(const Vector &aL, const Vector &aH, Material *amaterial): L(aL), H(aH)
 	{
 		for (int i = 0; i < 3; i++)
 			if (L.cell[i] > H.cell[i])
@@ -126,14 +130,16 @@
 		material = amaterial;
 	};
 	bool intersect(const Ray &ray, Float &dist) const;
-	__m128 intersect_packet(const RayPacket &rays, __m128 &dists);
 	bool intersect_all(const Ray &ray, Float dist, vector<Float> &allts) const { return false; };
 	bool intersect_bbox(const BBox &bbox) const;
-	const Vector3 normal(const Vector3 &P) const;
+	const Vector normal(const Vector &P) const;
 	BBox get_bbox() const { return BBox(L, H); };
-	const Vector3 getL() const { return L; };
-	const Vector3 getH() const { return H; };
+	const Vector getL() const { return L; };
+	const Vector getH() const { return H; };
 	ostream & dump(ostream &st) const;
+#ifndef NO_SSE
+	__m128 intersect_packet(const RayPacket &rays, __m128 &dists);
+#endif
 };
 
 /**
@@ -142,8 +148,8 @@
 class Vertex
 {
 public:
-	Vector3 P;
-	Vertex(const Vector3 &aP): P(aP) {};
+	Vector P;
+	Vertex(const Vector &aP): P(aP) {};
 	virtual ~Vertex() {};
 	virtual ostream & dump(ostream &st) const;
 };
@@ -154,12 +160,12 @@
 class NormalVertex: public Vertex
 {
 public:
-	Vector3 N;
+	Vector N;
 	NormalVertex(const NormalVertex *v): Vertex(v->P), N(v->N) {};
-	NormalVertex(const Vector3 &aP): Vertex(aP) {};
-	NormalVertex(const Vector3 &aP, const Vector3 &aN): Vertex(aP), N(aN) {};
-	const Vector3 &getNormal() { return N; };
-	void setNormal(const Vector3 &aN) { N = aN; };
+	NormalVertex(const Vector &aP): Vertex(aP) {};
+	NormalVertex(const Vector &aP, const Vector &aN): Vertex(aP), N(aN) {};
+	const Vector &getNormal() { return N; };
+	void setNormal(const Vector &aN) { N = aN; };
 	ostream & dump(ostream &st) const;
 };
 
@@ -180,13 +186,13 @@
 #ifdef TRI_PLUCKER
 	Float pla[6], plb[6], plc[6];
 #endif
-	Vector3 N;
-	const Vector3 smooth_normal(const Vector3 &P) const
+	Vector N;
+	const Vector smooth_normal(const Vector &P) const
 	{
 #ifdef TRI_BARI_PRE
-		const Vector3 &NA = static_cast<NormalVertex*>(A)->N;
-		const Vector3 &NB = static_cast<NormalVertex*>(B)->N;
-		const Vector3 &NC = static_cast<NormalVertex*>(C)->N;
+		const Vector &NA = static_cast<NormalVertex*>(A)->N;
+		const Vector &NB = static_cast<NormalVertex*>(B)->N;
+		const Vector &NC = static_cast<NormalVertex*>(C)->N;
 		static const int modulo3[5] = {0,1,2,0,1};
 		register const int ku = modulo3[k+1];
 		register const int kv = modulo3[k+2];
@@ -194,7 +200,7 @@
 		const Float pv = P[kv] - A->P[kv];
 		const Float u = pv * bnu + pu * bnv;
 		const Float v = pu * cnv + pv * cnu;
-		Vector3 n = NA + u * (NB - NA) + v * (NC - NA);
+		Vector n = NA + u * (NB - NA) + v * (NC - NA);
 		n.normalize();
 		return n;
 #else
@@ -207,15 +213,15 @@
 	Triangle() {};
 	Triangle(Vertex *aA, Vertex *aB, Vertex *aC, Material *amaterial);
 	bool intersect(const Ray &ray, Float &dist) const;
-#ifdef TRI_BARI_PRE
-	__m128 intersect_packet(const RayPacket &rays, __m128 &dists);
-#endif
 	bool intersect_all(const Ray &ray, Float dist, vector<Float> &allts) const {return false;};
 	bool intersect_bbox(const BBox &bbox) const;
-	const Vector3 normal(const Vector3 &P) const { return (material->smooth ? smooth_normal(P) : N); };
-	const Vector3 getNormal() const { return N; };
+	const Vector normal(const Vector &P) const { return (material->smooth ? smooth_normal(P) : N); };
+	const Vector getNormal() const { return N; };
 	BBox get_bbox() const;
 	ostream & dump(ostream &st) const;
+#if not defined(NO_SSE) and defined(TRI_BARI_PRE)
+	__m128 intersect_packet(const RayPacket &rays, __m128 &dists);
+#endif
 };
 
 template <class T> class Array
--- a/include/vector.h	Tue Apr 29 23:31:08 2008 +0200
+++ b/include/vector.h	Fri May 02 13:27:47 2008 +0200
@@ -1,5 +1,5 @@
 /*
- * vector.h: Vector3 class with Colour alias
+ * vector.h: Vector class with Colour alias
  *
  * This file is part of Pyrit Ray Tracer.
  *
@@ -37,145 +37,177 @@
 /**
  * three cell vector
  */
-class Vector3
+class Vector
 {
 public:
 	// data
 	union {
-		struct {
-			Float x, y, z;
-		};
-		struct {
-			Float r, g, b;
-		};
-		Float cell[3];
+#ifndef NO_SSE
+		__m128 mps;
+#endif
+		Float cell[4];
+		struct { Float x, y, z, w; };
+		struct { Float r, g, b, a; };
 	};
 
 	// constructors
-	Vector3(): x(0.0f), y(0.0f), z(0.0f) {};
-	Vector3(Float ax, Float ay, Float az): x(ax), y(ay), z(az) {};
+#ifndef NO_SSE
+	Vector(__m128 m): mps(m) {};
+#endif
+	Vector(): x(0.0f), y(0.0f), z(0.0f), w(1.0) {};
+	Vector(Float ax, Float ay, Float az): x(ax), y(ay), z(az), w(1.0) {};
 
 	// index operator
 	const Float &operator[](int index) const { return cell[index]; };
 
-	bool operator==(Vector3 &v) const { return x==v.x && y==v.y && z==v.z; };
+	bool operator==(Vector &v) const { return x==v.x && y==v.y && z==v.z; };
 
 	// normalize
-	Vector3 normalize()
+	Vector normalize()
 	{
-		Float f = 1.0f / mag();
-		x *= f;
-		y *= f;
-		z *= f;
+		const Float f = 1.0f / mag();
+		*this *= f;
 		return *this;
 	};
 
 	// get normalized copy
-	friend Vector3 normalize(Vector3 &v)
+	friend Vector normalize(const Vector &v)
 	{
 		const Float f = 1.0f / v.mag();
 		return v * f;
 	};
 
 	// square magnitude, magnitude
-	Float mag2() const	{ return x * x + y * y + z * z; };
+	Float mag2() const	{ return dot(*this, *this); };
 	Float mag() const	{ return sqrtf(mag2()); };
 
 	// negative
-	Vector3 operator-() const { return Vector3(-x, -y, -z); };
+	Vector operator-() const { return Vector(-x, -y, -z); };
 
 	// accumulate
-	Vector3 operator+=(const Vector3 &v)
+	Vector operator+=(const Vector &v)
 	{
+#ifdef NO_SSE
 		x += v.x;
 		y += v.y;
 		z += v.z;
+#else
+		mps = _mm_add_ps(mps, v.mps);
+#endif
 		return *this;
 	};
 
-	// cut
-	Vector3 operator/=(const Float &f)
+	// multiply
+	Vector operator*=(const Float &f)
 	{
-		x /= f;
-		y /= f;
-		z /= f;
+		x *= f;
+		y *= f;
+		z *= f;
+		return *this;
+	};
+
+
+	// cut
+	Vector operator/=(const Float &f)
+	{
+		Float finv = 1./f;
+		x *= finv;
+		y *= finv;
+		z *= finv;
 		return *this;
 	};
 
 	// sum
-	friend Vector3 operator+(const Vector3 &a, const Vector3 &b)
+	friend Vector operator+(const Vector &a, const Vector &b)
 	{
-		return Vector3(a.x + b.x, a.y + b.y, a.z + b.z);
+#ifdef NO_SSE
+		return Vector(a.x + b.x, a.y + b.y, a.z + b.z);
+#else
+		return Vector(_mm_add_ps(a.mps, b.mps));
+#endif
 	};
 
 	// difference
-	friend Vector3 operator-(const Vector3 &a, const Vector3 &b)
+	friend Vector operator-(const Vector &a, const Vector &b)
 	{
-		return Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
+#ifdef NO_SSE
+		return Vector(a.x - b.x, a.y - b.y, a.z - b.z);
+#else
+		return Vector(_mm_sub_ps(a.mps, b.mps));
+#endif
 	};
 
 	// dot product
-	friend Float dot(const Vector3 &a, const Vector3 &b)
+	friend Float dot(const Vector &a, const Vector &b)
 	{
 		return a.x * b.x + a.y * b.y + a.z * b.z;
 	};
 
 	// cross product
-	friend Vector3 cross(const Vector3 &a, const Vector3 &b)
+	friend Vector cross(const Vector &a, const Vector &b)
 	{
-		return Vector3(a.y * b.z - a.z * b.y,
+		return Vector(a.y * b.z - a.z * b.y,
 			a.z * b.x - a.x * b.z,
 			a.x * b.y - a.y * b.x);
 	};
 
 	// product of vector and scalar
-	friend Vector3 operator*(const Vector3 &v, const Float &f)
+	friend Vector operator*(const Vector &v, const Float &f)
 	{
-		return Vector3(f * v.x, f * v.y, f * v.z);
+		return Vector(f * v.x, f * v.y, f * v.z);
 	};
 
-	friend Vector3 operator*(const Float &f, const Vector3 &v)
+	friend Vector operator*(const Float &f, const Vector &v)
 	{
 		return v * f;
 	};
 
 	// scalar division
-	friend Vector3 operator/(const Vector3 &v, const Float &f)
+	friend Vector operator/(const Vector &v, const Float &f)
 	{
-		return Vector3(v.x / f, v.y / f, v.z / f);
+		const Float finv = 1./f;
+		return Vector(v.x * finv, v.y * finv, v.z * finv);
 	};
 
-	friend Vector3 operator/(const Float &f, const Vector3 &v)
+	friend Vector operator/(const Float &f, const Vector &v)
 	{
-		return Vector3(f / v.x, f / v.y, f / v.z);
+#ifdef NO_SSE
+		return Vector(f / v.x, f / v.y, f / v.z);
+#else
+		return Vector(_mm_div_ps(_mm_set_ps1(f), v.mps));
+#endif
 	};
 
 	// vector plus scalar
-	friend Vector3 operator+(const Vector3 &v, const Float &f)
+	friend Vector operator+(const Vector &v, const Float &f)
 	{
-		return Vector3(v.x + f, v.y + f, v.z + f);
+		return Vector(v.x + f, v.y + f, v.z + f);
 	};
 
 	// vector minus scalar
-	friend Vector3 operator-(const Vector3 &v, const Float &f)
+	friend Vector operator-(const Vector &v, const Float &f)
 	{
-		return Vector3(v.x - f, v.y - f, v.z - f);
+		return Vector(v.x - f, v.y - f, v.z - f);
 	};
 
 	// cell by cell product (only usable for colours)
-	friend Vector3 operator*(const Vector3 &a,  const Vector3 &b)
+	friend Vector operator*(const Vector &a,  const Vector &b)
 	{
-		return Vector3(a.x * b.x, a.y * b.y, a.z * b.z);
+#ifdef NO_SSE
+		return Vector(a.x * b.x, a.y * b.y, a.z * b.z);
+#else
+		return Vector(_mm_mul_ps(a.mps, b.mps));
+#endif
 	};
 
 	// write
-	friend ostream & operator<<(ostream &st, const Vector3 &v)
+	friend ostream & operator<<(ostream &st, const Vector &v)
 	{
 		return st << "(" << v.x << "," << v.y  << "," << v.z << ")";
 	};
 
 	// read
-	friend istream & operator>>(istream &st, Vector3 &v)
+	friend istream & operator>>(istream &st, Vector &v)
 	{
 		char s[10];
 		st.getline(s, 10, '(');
@@ -189,8 +221,9 @@
 	};
 };
 
-typedef Vector3 Colour;
+typedef Vector Colour;
 
+#ifndef NO_SSE
 class VectorPacket
 {
 public:
@@ -211,12 +244,17 @@
 	VectorPacket() {};
 	VectorPacket(__m128 ax, __m128 ay, __m128 az):
 		mx(ax), my(ay), mz(az) {};
-	VectorPacket(const Vector3 &v):
+	VectorPacket(const Vector &v):
 		mx(_mm_set_ps1(v.x)), my(_mm_set_ps1(v.y)), mz(_mm_set_ps1(v.z)) {};
 
-	Vector3 getVector(int i) const
+	Vector getVector(int i) const
 	{
-		return Vector3(x[i], y[i], z[i]);
+		return Vector(x[i], y[i], z[i]);
+	};
+
+	void setVector(int i, const Vector &v)
+	{
+		x[i] = v.x; y[i] = v.y; z[i] = v.z;
 	};
 
 	void normalize()
@@ -234,6 +272,39 @@
 		mz = _mm_mul_ps(mz, m);
 	};
 
+	// accumulate
+	VectorPacket operator+=(const VectorPacket &v)
+	{
+		mx = _mm_add_ps(mx, v.mx);
+		my = _mm_add_ps(my, v.my);
+		mz = _mm_add_ps(mz, v.mz);
+		return *this;
+	};
+
+	// add to non-masked components
+	VectorPacket selectiveAdd(__m128 mask, const VectorPacket &v)
+	{
+		mx = _mm_or_ps(_mm_and_ps(mask, _mm_add_ps(mx, v.mx)),
+			_mm_andnot_ps(mask, mx));
+		my = _mm_or_ps(_mm_and_ps(mask, _mm_add_ps(my, v.my)),
+			_mm_andnot_ps(mask, my));
+		mz = _mm_or_ps(_mm_and_ps(mask, _mm_add_ps(mz, v.mz)),
+			_mm_andnot_ps(mask, mz));
+		return *this;
+	};
+
+	// add scalar to non-masked components
+	VectorPacket selectiveAdd(__m128 mask, const __m128 m)
+	{
+		mx = _mm_or_ps(_mm_and_ps(mask, _mm_add_ps(mx, m)),
+			_mm_andnot_ps(mask, mx));
+		my = _mm_or_ps(_mm_and_ps(mask, _mm_add_ps(my, m)),
+			_mm_andnot_ps(mask, my));
+		mz = _mm_or_ps(_mm_and_ps(mask, _mm_add_ps(mz, m)),
+			_mm_andnot_ps(mask, mz));
+		return *this;
+	};
+
 	// dot product
 	friend __m128 dot(const VectorPacket &a, const VectorPacket &b)
 	{
@@ -275,6 +346,15 @@
 			_mm_div_ps(m, v.mz));
 	};
 
+	// cell by cell product (only usable for colours)
+	friend VectorPacket operator*(const VectorPacket &a,  const VectorPacket &b)
+	{
+		return VectorPacket(
+			_mm_mul_ps(a.mx, b.mx),
+			_mm_mul_ps(a.my, b.my),
+			_mm_mul_ps(a.mz, b.mz));
+	};
+
 	// write to character stream
 	friend ostream & operator<<(ostream &st, const VectorPacket &v)
 	{
@@ -283,5 +363,6 @@
 	};
 
 };
+#endif
 
 #endif
--- a/src/SConscript	Tue Apr 29 23:31:08 2008 +0200
+++ b/src/SConscript	Fri May 02 13:27:47 2008 +0200
@@ -1,6 +1,6 @@
-Import('env')
-env.Append(CPPPATH = '#include')
+Import('env buildmodule')
 
+env = env.Clone(CPPPATH = '#include')
 pyenv = env.Clone()
 if env['PLATFORM'] == 'win32':
 	import sys
@@ -26,17 +26,16 @@
 	objs.append( env.Object(src) )
 	shared_objs.append( env.SharedObject(src) )
 
-pymodule = pyenv.SharedLibrary('pyrit',
-	['raytracermodule.cc']+shared_objs,
-	SHLIBPREFIX = '',
-	CCFLAGS = '$CCFLAGS -Wno-write-strings')
-
-lib = env.StaticLibrary('pyrit', objs)
-
-env.Alias('objs', objs)
-env.Alias('static-lib', lib)
-env.Alias('shared-objs', shared_objs)
-env.Alias('python-module', pymodule)
-env.Alias('libs', ['static-lib', 'python-module'])
-
-Return('lib pymodule')
+if buildmodule:
+	pymodule = pyenv.SharedLibrary('pyrit',
+		['raytracermodule.cc']+shared_objs,
+		SHLIBPREFIX = '',
+		CCFLAGS = '$CCFLAGS -Wno-write-strings')
+	env.Alias('shared-objs', shared_objs)
+	env.Alias('python-module', pymodule)
+	Return('pymodule')
+else:
+	lib = env.StaticLibrary('pyrit', objs)
+	env.Alias('objs', objs)
+	env.Alias('static-lib', lib)
+	Return('lib')
--- a/src/container.cc	Tue Apr 29 23:31:08 2008 +0200
+++ b/src/container.cc	Fri May 02 13:27:47 2008 +0200
@@ -3,7 +3,7 @@
  *
  * This file is part of Pyrit Ray Tracer.
  *
- * Copyright 2007  Radek Brich
+ * Copyright 2007, 2008  Radek Brich
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -34,7 +34,7 @@
 	if (shapes.size() == 0) {
 		/* initialize bounding box */
 		bbox = aShape->get_bbox();
-		const Vector3 E(e, e, e);
+		const Vector E(e, e, e);
 		bbox = BBox(bbox.L - E, bbox.H + E);
 	} else {
 		/* adjust bounding box */
@@ -60,6 +60,7 @@
 	return nearest_shape;
 }
 
+#ifndef NO_SSE
 void Container::packet_intersection(const Shape **origin_shapes, const RayPacket &rays,
 	Float *nearest_distances, Shape **nearest_shapes)
 {
@@ -67,6 +68,7 @@
 		nearest_shapes[i] = nearest_intersection(origin_shapes[i], rays[i],
 			nearest_distances[i]);
 }
+#endif
 
 ostream & Container::dump(ostream &st)
 {
--- a/src/kdtree.cc	Tue Apr 29 23:31:08 2008 +0200
+++ b/src/kdtree.cc	Fri May 02 13:27:47 2008 +0200
@@ -3,7 +3,7 @@
  *
  * This file is part of Pyrit Ray Tracer.
  *
- * Copyright 2006, 2007  Radek Brich
+ * Copyright 2006, 2007, 2008  Radek Brich
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -54,7 +54,7 @@
 {
 	KdNode* node; /* pointer to far child */
 	Float t; /* the entry/exit signed distance */
-	Vector3 pb; /* the coordinates of entry/exit point */
+	Vector pb; /* the coordinates of entry/exit point */
 	int prev;
 };
 
@@ -150,7 +150,7 @@
 // export kd-tree as .obj for visualization
 // this would be hard to reconstruct later
 	static ofstream F("kdtree.obj");
-	Vector3 v;
+	Vector v;
 	static int f=0;
 	v.cell[axis] = node->getSplit();
 	v.cell[(axis+1)%3] = bounds.L.cell[(axis+1)%3];
@@ -239,7 +239,7 @@
 	Float splitVal;
 	int axis;
 	static const int mod3[] = {0,1,2,0,1};
-	const Vector3 invdir = 1 / ray.dir;
+	const Vector invdir = 1 / ray.dir;
 	while (node)
 	{
 		/* loop until a leaf is found */
@@ -328,6 +328,7 @@
 	return NULL;
 }
 
+#ifndef NO_SSE
 // stack element for kd-tree traversal (packet version)
 struct StackElem4
 {
@@ -514,6 +515,7 @@
 
 	/* ray leaves the scene */
 }
+#endif
 
 ostream & operator<<(ostream &st, KdNode &node)
 {
--- a/src/octree.cc	Tue Apr 29 23:31:08 2008 +0200
+++ b/src/octree.cc	Fri May 02 13:27:47 2008 +0200
@@ -3,7 +3,7 @@
  *
  * This file is part of Pyrit Ray Tracer.
  *
- * Copyright 2007  Radek Brich
+ * Copyright 2007, 2008  Radek Brich
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -170,8 +170,8 @@
 #	define tzm	st_cur->tzm
 
 	int a = 0;
-	Vector3 ro(ray.o);
-	Vector3 rdir(ray.dir);
+	Vector ro(ray.o);
+	Vector rdir(ray.dir);
 
 	if (rdir.x < 0.0)
 	{
--- a/src/raytracer.cc	Tue Apr 29 23:31:08 2008 +0200
+++ b/src/raytracer.cc	Fri May 02 13:27:47 2008 +0200
@@ -34,7 +34,7 @@
 
 // Hammersley spherical point distribution
 // http://www.cse.cuhk.edu.hk/~ttwong/papers/udpoint/udpoints.html
-Vector3 Raytracer::SphereDistribute(int i, int n, Float extent, Vector3 &normal)
+Vector Raytracer::SphereDistribute(int i, int n, Float extent, const Vector &normal)
 {
 	Float p, t, st, phi, phirad;
 	int kk;
@@ -67,23 +67,7 @@
 	y = xx*sin(q) + yy*cos(q);
 	z = zz;
 
-	return Vector3(x, y, z);
-}
-
-// ---- tyto dve funkce budou v budouci verzi metody objektu PhongShader
-
-// calculate shader function
-// P is point of intersection, N normal in this point
-Colour PhongShader_ambient(const Material &mat, const Vector3 &P)
-{
-	Colour col;
-	if (mat.texture)
-		col = mat.texture->evaluate(P);
-	else
-		col = mat.colour;
-
-	// ambient
-	return mat.ambient * col;
+	return Vector(x, y, z);
 }
 
 /*
@@ -92,82 +76,138 @@
  R direction of reflected ray,
  V direction to the viewer
 */
-Colour PhongShader_calculate(const Material &mat,
-	const Vector3 &P, const Vector3 &N, const Vector3 &R, const Vector3 &V,
-	const Light &light)
+Colour Raytracer::PhongShader(const Shape *shape,
+	const Vector &P, const Vector &N, const Vector &V)
 {
-	Colour I = Colour();
-	Vector3 L = light.pos - P;
-	L.normalize();
-	Float L_dot_N = dot(L, N);
-	Float R_dot_V = dot(R, V);
-
-	Colour col;
-	if (mat.texture)
-		col = mat.texture->evaluate(P);
-	else
-		col = mat.colour;
-
-	// diffuse
-	I = mat.diffuse * col * light.colour * L_dot_N;
+	Colour col, acc;
+	Material * const &mat = shape->material;
 
-	// specular
-	if (R_dot_V > 0)
-		I += mat.specular * light.colour * powf(R_dot_V, mat.shininess);
-	return I;
-}
+	if (mat->texture)
+		col = mat->texture->evaluate(P);
+	else
+		col = mat->colour;
 
-Colour Raytracer::shader_evalulate(const Ray &ray, int depth, Shape *origin_shape,
-	Float nearest_distance, Shape *nearest_shape)
-{
-	Colour col = Colour();
-	Vector3 P = ray.o + ray.dir * nearest_distance; // point of intersection
-	Vector3 normal = nearest_shape->normal(P);
-	bool from_inside = false;
-
-	// make shapes double sided
-	if (dot(normal, ray.dir) > 0.0)
-	{
-		normal = - normal;
-		from_inside = true;
-	}
-
-	col = PhongShader_ambient(*nearest_shape->material, P);
+	// ambient
+	acc = mat->ambient * col;
 
 	vector<Light*>::iterator light;
-	for (light = lights.begin(); light != lights.end(); light++) {
-		Vector3 jo, L = (*light)->pos - P; // direction vector to light
-		L.normalize();
-		Float L_dot_N = dot(L, normal);
-		if (L_dot_N > 0) {
+	for (light = lights.begin(); light != lights.end(); light++)
+	{
+		const Vector L = normalize((*light)->pos - P); // direction vector to light
+		const Float L_dot_N = dot(L, N);
+		if (L_dot_N > 0)
+		{
 			// test if this light is occluded (sharp shadows)
 			if ((*light)->cast_shadows) {
-				Ray shadow_ray = Ray(P, L);
+				const Ray shadow_ray = Ray(P, L);
 				Float dist = FLT_MAX;
-				if (top->nearest_intersection(nearest_shape, shadow_ray, dist))
+				if (top->nearest_intersection(shape, shadow_ray, dist))
 					continue;
 			}
 
-			// shading function
-			Vector3 R = L - 2.0 * L_dot_N * normal;
-			col += PhongShader_calculate(*nearest_shape->material,
-				P, normal, R, ray.dir, **light);
+			const Vector R = L - 2.0 * L_dot_N * N;
+			const Float R_dot_V = dot(R, V);
+
+			// diffuse
+			acc += mat->diffuse * col * (*light)->colour * L_dot_N;
+
+			// specular
+			if (R_dot_V > 0)
+				acc += mat->specular * (*light)->colour * powf(R_dot_V, mat->shininess);
 		}
 	}
 
+	return acc;
+}
+
+#ifndef NO_SSE
+VectorPacket Raytracer::PhongShader_packet(const Shape **shapes,
+	const VectorPacket &P, const VectorPacket &N, const VectorPacket &V)
+{
+	VectorPacket acc, colour;
+	union { __m128 ambient; float ambient_f[4]; };
+	union { __m128 diffuse; float diffuse_f[4]; };
+	union { __m128 specular; float specular_f[4]; };
+	union { __m128 shininess; float shininess_f[4]; };
+
+	for (int i = 0; i < 4; i++)
+		if (shapes[i] == NULL)
+		{
+			ambient_f[i] = 0;
+			diffuse_f[i] = 0;
+			specular_f[i] = 0;
+			shininess_f[i] = 0;
+		}
+		else
+		{
+			Material * const &mat = shapes[i]->material;
+			if (mat->texture)
+				colour.setVector(i, mat->texture->evaluate(P.getVector(i)));
+			else
+				colour.setVector(i, mat->colour);
+			ambient_f[i] = mat->ambient;
+			diffuse_f[i] = mat->diffuse;
+			specular_f[i] = mat->specular;
+			shininess_f[i] = mat->shininess;
+		}
+
+	// ambient
+	acc = colour * ambient;
+
+	Shape **shadow_shapes;
+	vector<Light*>::iterator light;
+	for (light = lights.begin(); light != lights.end(); light++)
+	{
+		 // direction vector to light
+		VectorPacket L = VectorPacket((*light)->pos) - P;
+		L.normalize();
+		const __m128 L_dot_N = dot(L, N);
+		__m128 valid = _mm_cmpgt_ps(L_dot_N, mZero);
+
+		// test if this light is occluded (sharp shadows)
+		if ((*light)->cast_shadows)
+		{
+			const RayPacket shadow_rays = RayPacket(P, L);
+			union { __m128 dists; float dists_f[4]; };
+			dists = mInf;
+			top->packet_intersection(shapes, shadow_rays,
+				dists_f, shadow_shapes);
+			valid = _mm_and_ps(valid, _mm_cmpeq_ps(dists, mInf));
+		}
+
+		const VectorPacket R = L - N * _mm_mul_ps(mTwo, L_dot_N);
+		const __m128 R_dot_V = dot(R, V);
+
+		// diffuse
+		acc.selectiveAdd(valid,
+			colour * VectorPacket((*light)->colour) * _mm_mul_ps(diffuse, L_dot_N));
+
+		// specular
+		valid = _mm_and_ps(valid, _mm_cmpgt_ps(R_dot_V, mZero));
+		__m128 spec = _mm_mul_ps(_mm_mul_ps(specular, _mm_set_ps1((*light)->colour.r)),
+			_mm_fastpow(R_dot_V, shininess));
+		acc.selectiveAdd(valid, spec);
+	}
+	return acc;
+}
+#endif
+
+void Raytracer::lightScatter(const Ray &ray, const Shape *shape, int depth,
+	const Vector &P, const Vector &normal, bool from_inside, Colour &col)
+{
 	if (depth < max_depth)
 	{
 		Colour trans_col, refl_col;
-		Float trans = nearest_shape->material->transmissivity;
-		Float refl = nearest_shape->material->reflectivity;
+		Float trans = shape->material->transmissivity;
+		Float refl = shape->material->reflectivity;
 		const Float cos_i = - dot(normal, ray.dir);
 
 		// reflection
 		if (refl > 0.01)
 		{
-			Vector3 newdir = ray.dir + 2.0 * cos_i * normal;
+			Vector newdir = ray.dir + 2.0 * cos_i * normal;
 			Ray newray = Ray(P, newdir);
-			refl_col = raytrace(newray, depth + 1, nearest_shape);
+			refl_col = raytrace(newray, depth + 1, shape);
 		}
 
 		// refraction
@@ -176,14 +216,14 @@
 			Float n, n1, n2;
 			if (from_inside)
 			{
-				n1 = nearest_shape->material->refract_index;
+				n1 = shape->material->refract_index;
 				n2 = 1.0;
 				n = n1;
 			}
 			else
 			{
 				n1 = 1.0;
-				n2 = nearest_shape->material->refract_index;
+				n2 = shape->material->refract_index;
 				n = 1.0 / n2;
 			}
 			const Float sin2_t = n*n * (1 - cos_i*cos_i);
@@ -202,7 +242,7 @@
 				const Float R = (Rper*Rper + Rpar*Rpar)/2;
 				refl += R*trans;
 				trans = (1-R)*trans;
-				Vector3 newdir = n * ray.dir + (n*cos_i - cos_t) * normal;
+				Vector newdir = n * ray.dir + (n*cos_i - cos_t) * normal;
 				Ray newray = Ray(P + 0.001*newdir, newdir);
 				trans_col = raytrace(newray, depth + 1, NULL);
 			}
@@ -211,14 +251,15 @@
 	}
 
 	// ambient occlusion
-	if (!from_inside && ao_samples)
+	if (ao_samples && !from_inside)
 	{
 		Float miss = 0;
-		for (int i = 0; i < ao_samples; i++) {
-			Vector3 dir = SphereDistribute(i, ao_samples, ao_angle, normal);
+		for (int i = 0; i < ao_samples; i++)
+		{
+			Vector dir = SphereDistribute(i, ao_samples, ao_angle, normal);
 			Ray ao_ray = Ray(P, dir);
 			Float dist = ao_distance;
-			Shape *shape_in_way = top->nearest_intersection(nearest_shape, ao_ray, dist);
+			Shape *shape_in_way = top->nearest_intersection(shape, ao_ray, dist);
 			if (shape_in_way == NULL)
 				miss += 1.0;
 			else
@@ -227,11 +268,9 @@
 		Float ao_intensity = miss / ao_samples;
 		col = col * ao_intensity;
 	}
-
-	return col;
 }
 
-Colour Raytracer::raytrace(Ray &ray, int depth, Shape *origin_shape)
+Colour Raytracer::raytrace(Ray &ray, int depth, const Shape *origin_shape)
 {
 	Float nearest_distance = Inf;
 	Shape *nearest_shape = top->nearest_intersection(origin_shape, ray, nearest_distance);
@@ -239,23 +278,83 @@
 	if (nearest_shape == NULL)
 		return bg_colour;
 	else
-		return shader_evalulate(ray, depth, origin_shape, nearest_distance, nearest_shape);
+	{
+		const Vector P = ray.o + ray.dir * nearest_distance; // point of intersection
+		Vector normal = nearest_shape->normal(P);
+		bool from_inside = false;
+
+		// make shapes double sided
+		if (dot(normal, ray.dir) > 0.0)
+		{
+			normal = - normal;
+			from_inside = true;
+		}
+
+		// shading function
+		Colour col = PhongShader(nearest_shape, P, normal, ray.dir);
+		lightScatter(ray, nearest_shape, depth, P, normal, from_inside, col);
+		return col;
+	}
 }
 
+#ifndef NO_SSE
 void Raytracer::raytracePacket(RayPacket &rays, Colour *results)
 {
-	Float nearest_distances[4] = {Inf,Inf,Inf,Inf};
+	union {
+		float nearest_distances[4];
+		__m128 m_nearest_distances;
+	};
+	__m128 mask;
 	Shape *nearest_shapes[4];
 	static const Shape *origin_shapes[4] = {NULL, NULL, NULL, NULL};
+	m_nearest_distances = mInf;
+	mask = mAllSet;
+
 	top->packet_intersection(origin_shapes, rays, nearest_distances, nearest_shapes);
 
+	mask = _mm_cmpneq_ps(m_nearest_distances, mInf);
+	if (!_mm_movemask_ps(mask))
+	{
+		for (int i = 0; i < 4; i++)
+			results[i] = bg_colour;
+		return;
+	}
+
+	const VectorPacket P = rays.o + rays.dir * m_nearest_distances; // point of intersection
+
+	VectorPacket normal;
+	for (int i = 0; i < 4; i++)
+		if (nearest_shapes[i] != NULL)
+			normal.setVector(i, nearest_shapes[i]->normal(P.getVector(i)));
+
+	// make shapes double sided
+	__m128 from_inside = _mm_cmpgt_ps(dot(normal, rays.dir), mZero);
+	normal.mx = _mm_or_ps(_mm_and_ps(from_inside, _mm_sub_ps(mZero, normal.mx)),
+		_mm_andnot_ps(from_inside, normal.mx));
+	normal.my = _mm_or_ps(_mm_and_ps(from_inside, _mm_sub_ps(mZero, normal.my)),
+		_mm_andnot_ps(from_inside, normal.my));
+	normal.mz = _mm_or_ps(_mm_and_ps(from_inside, _mm_sub_ps(mZero, normal.mz)),
+		_mm_andnot_ps(from_inside, normal.mz));
+
+	// shading function
+	VectorPacket pres =
+		PhongShader_packet(const_cast<const Shape**>(nearest_shapes), P, normal, rays.dir);
+	//pres.mx = _mm_or_ps(_mm_and_ps(mask, pres.mx), _mm_andnot_ps(mask, _mm_set_ps1(bg_colour.r)));
+	//pres.my = _mm_or_ps(_mm_and_ps(mask, pres.my), _mm_andnot_ps(mask, _mm_set_ps1(bg_colour.g)));
+	//pres.mz = _mm_or_ps(_mm_and_ps(mask, pres.mz), _mm_andnot_ps(mask, _mm_set_ps1(bg_colour.b)));
+
 	for (int i = 0; i < 4; i++)
-		if (nearest_shapes[i] == NULL)
-			results[i] = bg_colour;
+		if (nearest_shapes[i] != NULL)
+		{
+			results[i] = pres.getVector(i);
+			lightScatter(rays[i], nearest_shapes[i], 0,
+				P.getVector(i), normal.getVector(i), (_mm_movemask_ps(from_inside)>>i)&1,
+				results[i]);
+		}
 		else
-			results[i] = shader_evalulate(rays[i], 0, NULL,
-				nearest_distances[i], nearest_shapes[i]);
+			results[i] = bg_colour;
 }
+#endif
 
 void *Raytracer::raytrace_worker(void *d)
 {
@@ -265,8 +364,10 @@
 	Colour my_colours[my_queue_size];
 	int my_count;
 	Ray ray;
+#ifndef NO_SSE
 	RayPacket rays;
 	const bool can_use_packets = (rt->use_packets && rt->sampler->packetableSamples());
+#endif
 	for (;;)
 	{
 		pthread_mutex_lock(&rt->sample_queue_mutex);
@@ -306,6 +407,7 @@
 		pthread_mutex_unlock(&rt->sample_queue_mutex);
 
 		// do the work
+#ifndef NO_SSE
 		if (can_use_packets)
 		{
 			// packet ray tracing
@@ -317,6 +419,7 @@
 			}
 		}
 		else
+#endif
 		{
 			// single ray tracing
 			for (int i = 0; i < my_count; i++)
--- a/src/raytracermodule.cc	Tue Apr 29 23:31:08 2008 +0200
+++ b/src/raytracermodule.cc	Fri May 02 13:27:47 2008 +0200
@@ -122,7 +122,7 @@
 		return NULL;
 
 	v = PyObject_New(LightObject, &LightType);
-	v->light = new Light(Vector3(px, py, pz), Colour(cr, cg, cb));
+	v->light = new Light(Vector(px, py, pz), Colour(cr, cg, cb));
 	return (PyObject*)v;
 }
 
@@ -222,11 +222,11 @@
 
 	v = PyObject_New(CameraObject, &CameraType);
 	if (TLookAt)
-		v->camera = new Camera(Vector3(ex, ey, ez),
-			Vector3(lax, lay, laz), Vector3(upx, upy, upz));
+		v->camera = new Camera(Vector(ex, ey, ez),
+			Vector(lax, lay, laz), Vector(upx, upy, upz));
 	else
-		v->camera = new Camera(Vector3(ex, ey, ez),
-			Vector3(px, py, pz), Vector3(ux, uy, uz), Vector3(vx, vy, vz));
+		v->camera = new Camera(Vector(ex, ey, ez),
+			Vector(px, py, pz), Vector(ux, uy, uz), Vector(vx, vy, vz));
 	return (PyObject*)v;
 }
 
@@ -248,7 +248,7 @@
 		if (!PyArg_ParseTuple(TEye, "fff", &ex, &ey, &ez))
 			return NULL;
 
-	((CameraObject *)self)->camera->setEye(Vector3(ex, ey, ez));
+	((CameraObject *)self)->camera->setEye(Vector(ex, ey, ez));
 
 	Py_INCREF(Py_None);
 	return Py_None;
@@ -455,7 +455,7 @@
 				return NULL;
 
 		v = PyObject_New(NormalVertexObject, &NormalVertexType);
-		v->nvertex = new NormalVertex(Vector3(vx, vy, vz), Vector3(nx, ny, nz));
+		v->nvertex = new NormalVertex(Vector(vx, vy, vz), Vector(nx, ny, nz));
 	}
 	return (PyObject*)v;
 }
@@ -477,7 +477,7 @@
 	if (!PyArg_ParseTuple(TNor, "fff", &nx, &ny, &nz))
 		return NULL;
 
-	((NormalVertexObject *)self)->nvertex->setNormal(Vector3(nx,ny,nz).normalize());
+	((NormalVertexObject *)self)->nvertex->setNormal(Vector(nx,ny,nz).normalize());
 
 	Py_INCREF(Py_None);
 	return Py_None;
@@ -567,7 +567,7 @@
 {
 	PyObject *obj;
 
-	Vector3 N = ((Triangle*)((TriangleObject *)self)->shape.shape)->getNormal();
+	Vector N = ((Triangle*)((TriangleObject *)self)->shape.shape)->getNormal();
 
 	obj = Py_BuildValue("(fff)", N.x, N.y, N.z);
 	return obj;
@@ -614,7 +614,7 @@
 		return NULL;
 
 	v = PyObject_New(SphereObject, &SphereType);
-	v->shape.shape = new Sphere(Vector3(cx, cy, cz), radius, material->material);
+	v->shape.shape = new Sphere(Vector(cx, cy, cz), radius, material->material);
 	Py_INCREF(material);
 	return (PyObject*)v;
 }
@@ -664,7 +664,7 @@
 		return NULL;
 
 	v = PyObject_New(BoxObject, &BoxType);
-	v->shape.shape = new Box(Vector3(lx, ly, lz), Vector3(hx, hy, hz), material->material);
+	v->shape.shape = new Box(Vector(lx, ly, lz), Vector(hx, hy, hz), material->material);
 	Py_INCREF(material);
 	return (PyObject*)v;
 }
--- a/src/scene.cc	Tue Apr 29 23:31:08 2008 +0200
+++ b/src/scene.cc	Fri May 02 13:27:47 2008 +0200
@@ -53,15 +53,15 @@
 	x = 2*( (t8 + t10)*p.x + (t6 -  t4)*p.y + (t3 + t7)*p.z ) + p.x;
 	y = 2*( (t4 +  t6)*p.x + (t5 + t10)*p.y + (t9 - t2)*p.z ) + p.y;
 	z = 2*( (t7 -  t3)*p.x + (t2 +  t9)*p.y + (t5 + t8)*p.z ) + p.z;
-	p = Vector3(x,y,z);
+	p = Vector(x,y,z);
 	x = 2*( (t8 + t10)*u.x + (t6 -  t4)*u.y + (t3 + t7)*u.z ) + u.x;
 	y = 2*( (t4 +  t6)*u.x + (t5 + t10)*u.y + (t9 - t2)*u.z ) + u.y;
 	z = 2*( (t7 -  t3)*u.x + (t2 +  t9)*u.y + (t5 + t8)*u.z ) + u.z;
-	u = Vector3(x,y,z);
+	u = Vector(x,y,z);
 	x = 2*( (t8 + t10)*v.x + (t6 -  t4)*v.y + (t3 + t7)*v.z ) + v.x;
 	y = 2*( (t4 +  t6)*v.x + (t5 + t10)*v.y + (t9 - t2)*v.z ) + v.y;
 	z = 2*( (t7 -  t3)*v.x + (t2 +  t9)*v.y + (t5 + t8)*v.z ) + v.z;
-	v = Vector3(x,y,z);
+	v = Vector(x,y,z);
 	p.normalize();
 	u.normalize();
 	v.normalize();
@@ -108,6 +108,7 @@
 	return true;
 }
 
+#ifndef NO_SSE
 // rewrite of BBox::intersect for ray packets
 __m128 BBox::intersect_packet(const RayPacket &rays, __m128 &a, __m128 &b)
 {
@@ -149,3 +150,4 @@
 	b = tfar;
 	return mask;
 }
+#endif
--- a/src/serialize.cc	Tue Apr 29 23:31:08 2008 +0200
+++ b/src/serialize.cc	Fri May 02 13:27:47 2008 +0200
@@ -68,7 +68,7 @@
 		// Vertex
 		if (s.compare("(v") == 0)
 		{
-			Vector3 P;
+			Vector P;
 			st >> P;
 			getline(st, s, ')');
 			vertices.push_back(new Vertex(P));
@@ -79,7 +79,7 @@
 		// NormalVertex
 		if (s.compare("(vn") == 0)
 		{
-			Vector3 P,N;
+			Vector P,N;
 			st >> P;
 			getline(st, s, ',');
 			st >> N;
@@ -105,7 +105,7 @@
 		// box
 		if (s.compare("(box") == 0)
 		{
-			Vector3 L,H;
+			Vector L,H;
 			st >> L;
 			getline(st, s, ',');
 			st >> H;
@@ -116,7 +116,7 @@
 		// Sphere
 		if (s.compare("(sphere") == 0)
 		{
-			Vector3 center;
+			Vector center;
 			Float radius;
 			st >> center;
 			getline(st, s, ',');
--- a/src/shapes.cc	Tue Apr 29 23:31:08 2008 +0200
+++ b/src/shapes.cc	Fri May 02 13:27:47 2008 +0200
@@ -29,7 +29,7 @@
 
 bool Sphere::intersect(const Ray &ray, Float &dist) const
 {
-	Vector3 V = ray.o - center;
+	Vector V = ray.o - center;
 	register Float d = -dot(V, ray.dir);
 	register Float Det = d * d - (dot(V,V) - sqr_radius);
 	register Float t1,t2;
@@ -54,6 +54,7 @@
 	return false;
 }
 
+#ifndef NO_SSE
 __m128 Sphere::intersect_packet(const RayPacket &rays, __m128 &dists)
 {
 	VectorPacket V = rays.o - VectorPacket(center);
@@ -79,13 +80,14 @@
 	dists = _mm_or_ps(_mm_and_ps(mask, newdists), _mm_andnot_ps(mask, dists));
 	return mask;
 }
+#endif
 
 /* if there should be CSG sometimes, this may be needed... */
 bool Sphere::intersect_all(const Ray &ray, Float dist, vector<Float> &allts) const
 {
 	//allts = new vector<Float>();
 
-	Vector3 V = ((Ray)ray).o - center;
+	Vector V = ((Ray)ray).o - center;
 	Float Vd = - dot(V, ray.dir);
 	Float Det = Vd * Vd - (dot(V,V) - sqr_radius);
 
@@ -175,6 +177,7 @@
 	return false;
 }
 
+#ifndef NO_SSE
 __m128 Box::intersect_packet(const RayPacket &rays, __m128 &dists)
 {
 	register __m128 tnear = mZero;
@@ -215,6 +218,7 @@
 	dists = _mm_or_ps(_mm_and_ps(mask, tnear), _mm_andnot_ps(mask, dists));
 	return mask;
 }
+#endif
 
 bool Box::intersect_bbox(const BBox &bbox) const
 {
@@ -224,10 +228,10 @@
 	H.z > bbox.L.z && L.z < bbox.H.z);
 }
 
-const Vector3 Box::normal(const Vector3 &P) const
+const Vector Box::normal(const Vector &P) const
 {
-	register Vector3 l = P - L;
-	register Vector3 h = H - P;
+	register Vector l = P - L;
+	register Vector h = H - P;
 
 	if (l.x < h.x)
 		h.x = -1;
@@ -273,7 +277,7 @@
 }
 
 #ifdef TRI_PLUCKER
-inline void Plucker(const Vector3 &p, const Vector3 &q, Float* pl)
+inline void Plucker(const Vector &p, const Vector &q, Float* pl)
 {
     pl[0] = p.x*q.y - q.x*p.y;
     pl[1] = p.x*q.z - q.x*p.z;
@@ -294,8 +298,8 @@
 {
 	material = amaterial;
 
-	const Vector3 c = B->P - A->P;
-	const Vector3 b = C->P - A->P;
+	const Vector c = B->P - A->P;
+	const Vector b = C->P - A->P;
 
 	N = cross(c, b);
 	N.normalize();
@@ -363,8 +367,8 @@
 
 #if defined(TRI_BARI) || defined(TRI_BARI_PRE)
 	static const int modulo3[5] = {0,1,2,0,1};
-	const Vector3 &O = ray.o;
-	const Vector3 &D = ray.dir;
+	const Vector &O = ray.o;
+	const Vector &D = ray.dir;
 	register const int u = modulo3[k+1];
 	register const int v = modulo3[k+2];
 #endif
@@ -392,8 +396,8 @@
 #ifdef TRI_BARI
 	// original barycentric coordinates based intesection
 	// not optimized, just for reference
-	const Vector3 c = B - A;
-	const Vector3 b = C - A;
+	const Vector c = B - A;
+	const Vector b = C - A;
 	// distance test
 	const Float t = - dot( (O-A), N) / dot(D,N);
 	if (t < Eps || t > dist)
@@ -415,7 +419,7 @@
 #endif
 }
 
-#ifdef TRI_BARI_PRE
+#if not defined(NO_SSE) and defined(TRI_BARI_PRE)
 __m128 Triangle::intersect_packet(const RayPacket &rays, __m128 &dists)
 {
 	static const int modulo3[5] = {0,1,2,0,1};
@@ -463,14 +467,14 @@
 
 bool Triangle::intersect_bbox(const BBox &bbox) const
 {
-	const Vector3 boxcenter = (bbox.L+bbox.H)*0.5;
-	const Vector3 boxhalfsize = (bbox.H-bbox.L)*0.5;
-	const Vector3 v0 = A->P - boxcenter;
-	const Vector3 v1 = B->P - boxcenter;
-	const Vector3 v2 = C->P - boxcenter;
-	const Vector3 e0 = v1-v0;
-	const Vector3 e1 = v2-v1;
-	const Vector3 e2 = v0-v2;
+	const Vector boxcenter = (bbox.L+bbox.H)*0.5;
+	const Vector boxhalfsize = (bbox.H-bbox.L)*0.5;
+	const Vector v0 = A->P - boxcenter;
+	const Vector v1 = B->P - boxcenter;
+	const Vector v2 = C->P - boxcenter;
+	const Vector e0 = v1-v0;
+	const Vector e1 = v2-v1;
+	const Vector e2 = v0-v2;
 
 	Float fex = fabsf(e0.x);
 	Float fey = fabsf(e0.y);
@@ -569,7 +573,7 @@
 	if(min>boxhalfsize.z || max<-boxhalfsize.z) return false;
 
 	/*  test if the box intersects the plane of the triangle */
-	Vector3 vmin,vmax;
+	Vector vmin,vmax;
 	Float v;
 	for(int q=0;q<3;q++)
 	{
--- a/tests/vector.cc	Tue Apr 29 23:31:08 2008 +0200
+++ b/tests/vector.cc	Fri May 02 13:27:47 2008 +0200
@@ -3,11 +3,11 @@
 
 int main() {
 	{
-	/* Vector3 */
-	Vector3 a(1, 2, 3);
-	cout << "=== Vector3 test ===" << endl;
+	/* Vector */
+	Vector a(1, 2, 3);
+	cout << "=== Vector test ===" << endl;
 	cout << "a = " << a << endl;
-	Vector3 b(2, 3, 2);
+	Vector b(2, 3, 2);
 	cout << "b = " << b << endl;
 
 	cout << "a + b = " << a + b << endl;