diff --git a/Aquaria/AnimationEditor.cpp b/Aquaria/AnimationEditor.cpp
index 0ca2d0d..25c18d3 100644
--- a/Aquaria/AnimationEditor.cpp
+++ b/Aquaria/AnimationEditor.cpp
@@ -240,6 +240,7 @@ void AnimationEditor::applyState()
 	editingBone = 0;
 	currentKey = 0;
 	splinegrid = 0;
+	assistedSplineEdit = true;
 
 	editSprite = new SkeletalSprite();
 	editSprite->cull = false;
@@ -302,6 +303,8 @@ void AnimationEditor::applyState()
 	addAction(MakeFunctionEvent(AnimationEditor, decrTimelineGrid), KEY_O, 0);
 	addAction(MakeFunctionEvent(AnimationEditor, incrTimelineGrid), KEY_P, 0);
 
+	addAction(MakeFunctionEvent(AnimationEditor, toggleSplineMode), KEY_W, 0);
+
 
 
 	addAction(ACTION_SWIMLEFT,	KEY_J, -1);
@@ -462,6 +465,12 @@ void AnimationEditor::applyState()
 	reverseAnim->event.set(MakeFunctionEvent(AnimationEditor, reverseAnim));
 	addRenderObject(reverseAnim, LR_MENU);
 
+	DebugButton *bAssist = new DebugButton(0, 0, 150);
+	bAssist->position = Vector(10, 510);
+	bAssist->event.set(MakeFunctionEvent(AnimationEditor, toggleSplineMode));
+	addRenderObject(bAssist, LR_MENU);
+	bSplineAssist = bAssist;
+
 
 	OutlineRect *rect = new OutlineRect;
 	rect->setWidthHeight(400,400);
@@ -494,6 +503,7 @@ void AnimationEditor::applyState()
 
 	updateTimelineGrid();
 	updateTimelineUnit();
+	updateButtonLabels();
 }
 
 void AnimationEditor::clearUndoHistory()
@@ -1005,18 +1015,24 @@ void AnimationEditor::editStripKey()
 				bgGrad->makeVertical(Vector(0.4f, 0.6f, 0.4f), Vector(0.8f, 1, 0.8f));
 
 				BoneKeyframe *bk = a->getKeyframe(currentKey)->getBoneKeyframe(editingBone->boneIdx);
-				assert(bk->controlpoints.size() == interp->bsp.ctrlX() * interp->bsp.ctrlY());
+				const size_t totalcp = interp->bsp.ctrlX() * interp->bsp.ctrlY();
+				const bool reset = bk->controlpoints.empty();
+				bk->controlpoints.resize(totalcp);
+				assert(!splinegrid);
 
-				splinegrid = new SplineGrid;
+				splinegrid = new SplineGrid();
 				DynamicRenderGrid *rgrid = splinegrid->resize(interp->bsp.ctrlX(), interp->bsp.ctrlY(), grid->width(), grid->height(), interp->bsp.degX(), interp->bsp.degY());
 				rgrid->setDrawOrder(grid->getDrawOrder());
 				splinegrid->setTexture(editingBone->texture->name);
 				splinegrid->setWidthHeight(editingBone->width, editingBone->height);
 				splinegrid->position = Vector(400, 300);
-				//splinegrid->followCamera = 1;
-				splinegrid->importControlPoints(&bk->controlpoints[0]);
-				//editSprite->addChild(splinegrid, PM_STATIC, RBP_OFF, CHILD_FRONT);
-				//editSprite->alphaMod = 0.5f;
+				splinegrid->setAssist(assistedSplineEdit);
+
+				if(reset)
+					splinegrid->resetControlPoints();
+				else
+					splinegrid->importKeyframe(bk);
+
 				addRenderObject(splinegrid, LR_PARTICLES_TOP);
 			}
 			else
@@ -1616,6 +1632,23 @@ void AnimationEditor::decrTimelineGrid()
 	updateTimelineGrid();
 }
 
+void AnimationEditor::toggleSplineMode()
+{
+	assistedSplineEdit = !assistedSplineEdit;
+	updateButtonLabels();
+	if(splinegrid)
+		splinegrid->setAssist(assistedSplineEdit);
+}
+
+void AnimationEditor::updateButtonLabels()
+{
+	{
+		std::ostringstream os;
+		os << "S.Assist (W)(" << (assistedSplineEdit ? "on" : "off") << ")";
+		bSplineAssist->label->setText(os.str());
+	}
+}
+
 void AnimationEditor::updateTimelineGrid()
 {
 	std::ostringstream os;
@@ -1636,9 +1669,9 @@ void AnimationEditor::applyBoneToSplineGrid()
 	{
 		Animation *a = editSprite->getCurrentAnimation();
 		BoneKeyframe *bk = a->getKeyframe(currentKey)->getBoneKeyframe(editingBone->boneIdx);
-		assert(bk->controlpoints.size() == splinegrid->getSpline().ctrlX() * splinegrid->getSpline().ctrlY());
+
 		assert(bk->grid.size() == editingBone->getGrid()->linearsize());
-		splinegrid->importControlPoints(&bk->controlpoints[0]);
+		splinegrid->importKeyframe(bk);
 	}
 }
 
@@ -1648,10 +1681,10 @@ void AnimationEditor::applySplineGridToBone()
 	{
 		Animation *a = editSprite->getCurrentAnimation();
 		BoneKeyframe *bk = a->getKeyframe(currentKey)->getBoneKeyframe(editingBone->boneIdx);
-		assert(bk->controlpoints.size() == splinegrid->getSpline().ctrlX() * splinegrid->getSpline().ctrlY());
 		assert(bk->grid.size() == editingBone->getGrid()->linearsize());
-		splinegrid->exportControlPoints(&bk->controlpoints[0]);
+		splinegrid->exportKeyframe(bk);
 		BoneGridInterpolator *interp = a->getBoneGridInterpolator(editingBone->boneIdx);
 		interp->updateGridAndBone(*bk, editingBone);
 	}
 }
+
diff --git a/Aquaria/AnimationEditor.h b/Aquaria/AnimationEditor.h
index c18ad74..e69e12a 100644
--- a/Aquaria/AnimationEditor.h
+++ b/Aquaria/AnimationEditor.h
@@ -8,6 +8,7 @@
 class DebugFont;
 class BitmapText;
 class SplineGrid;
+class DebugButton;
 
 class KeyframeWidget : public Quad
 {
@@ -133,6 +134,7 @@ public:
 	SkeletalKeyframe buffer;
 
 	bool editingStrip;
+	bool assistedSplineEdit;
 	size_t selectedStripPoint;
 
 	void reverseAnim();
@@ -162,6 +164,10 @@ public:
 	SplineGrid *splinegrid;
 	void applySplineGridToBone();
 	void applyBoneToSplineGrid();
+
+	void toggleSplineMode();
+	DebugButton *bSplineAssist;
+	void updateButtonLabels();
 };
 
 
diff --git a/BBGE/Interpolators.cpp b/BBGE/Interpolators.cpp
index 92ec687..f954b53 100644
--- a/BBGE/Interpolators.cpp
+++ b/BBGE/Interpolators.cpp
@@ -1,6 +1,5 @@
 #include "Interpolators.h"
 #include <math.h>
-#include "tbsp.hh"
 
 // usually one would expect that a bspline goes from t=0 to t=1.
 // here, splines eval between these -0.5 .. +0.5.
@@ -101,9 +100,10 @@ void BSpline2D::resize(size_t cx, size_t cy, unsigned degx, unsigned degy)
 
 void BSpline2D::recalc(Vector* dst, size_t xres, size_t yres, const Vector *controlpoints)
 {
+    const unsigned maxDeg = std::max(_degx, _degy);
+
     std::vector<Vector> tmpv;
-    size_t degn = std::max(_degx, _degy);
-    size_t tmpn = (yres * _cpx) + degn;
+    size_t tmpn = (yres * _cpx) + maxDeg;
     size_t tmpsz = tmpn * sizeof(Vector);
     Vector *tmp;
     if(tmpsz < 17*1024)
@@ -113,7 +113,9 @@ void BSpline2D::recalc(Vector* dst, size_t xres, size_t yres, const Vector *cont
         tmpv.resize(tmpn);
         tmp = &tmpv[0];
     }
-    Vector *work = tmp + (tmpn - degn);
+    // tmp[] layout: leftmost part: entries to hold the matrix as it's being built;
+    //               rightmost part: maxDeg entries as workmem for the deBoor eval
+    Vector *work = tmp + (tmpn - maxDeg);
 
     // Each column -> Y-axis interpolation
     for(size_t x = 0; x < _cpx; ++x)
@@ -162,3 +164,80 @@ void BSpline2DWithPoints::reset()
 {
     BSpline2D::reset(&controlpoints[0]);
 }
+
+bool BSpline2DControlPointGenerator::resize(size_t cx, size_t cy)
+{
+    const size_t interpStorageSizeX = tbsp__getInterpolatorStorageSize(cx, cx);
+    const size_t interpStorageSizeY = tbsp__getInterpolatorStorageSize(cy, cy);
+    const size_t interpStorageNeeded = interpStorageSizeX + interpStorageSizeY;
+    floats.resize(interpStorageNeeded);
+
+    cp2d.init(cx, cy);
+
+    const size_t maxcp = std::max(cx, cy);
+    vectmp.resize(maxcp);
+
+    return interp.x.init(&floats[0], cx, cx)
+        && interp.y.init(&floats[interpStorageSizeX], cy, cy);
+}
+
+bool BSpline2DControlPointGenerator::refresh(const float* knotsx, const float* knotsy, unsigned degx, unsigned degy)
+{
+    const size_t maxcp = vectmp.size();
+    const size_t tmpn = tbsp__getInterpolatorRefreshTempSize(maxcp, maxcp);
+    const size_t tmpsz = tmpn * sizeof(float);
+    float *tmp;
+    std::vector<float> tmpv;
+    if(tmpsz < 17*1024)
+        tmp = (float*)alloca(tmpsz);
+    else
+    {
+        tmpv.resize(tmpn);
+        tmp = &tmpv[0];
+    }
+
+    return interp.x.refresh(tmp, knotsx, degx)
+        && interp.y.refresh(tmp, knotsy, degy);
+}
+
+Vector* BSpline2DControlPointGenerator::generateControlPoints(const Vector *points2d)
+{
+    const size_t cpx = interp.x.getNumInputPoints();
+    const size_t cpy = interp.y.getNumInputPoints();
+
+    // y direction first
+    for(size_t x = 0; x < cpx; ++x)
+    {
+        const Vector *src = &points2d[x];
+        for(size_t y = 0; y < cpy; ++y, src += cpx)
+            vectmp[y] = *src;
+
+        // solve in-place
+        interp.y.generateControlPoints<Vector>(&vectmp[0], NULL, &vectmp[0]);
+
+        for(size_t y = 0; y < cpy; ++y)
+            cp2d(x, y) = vectmp[y];
+    }
+
+    // x direction
+    for(size_t y = 0; y < cpy; ++y)
+    {
+        Vector *row = cp2d.row(y);
+        // solve in-place
+        interp.x.generateControlPoints<Vector>(row, NULL, row);
+    }
+
+    return cp2d.data();
+}
+
+
+bool BSpline2DControlPointGeneratorWithPoints::resize(size_t cx, size_t cy)
+{
+    designpoints.resize(cx * cy);
+    return BSpline2DControlPointGenerator::resize(cx, cy);
+}
+
+Vector* BSpline2DControlPointGeneratorWithPoints::generateControlPoints()
+{
+    return BSpline2DControlPointGenerator::generateControlPoints(&designpoints[0]);
+}
diff --git a/BBGE/Interpolators.h b/BBGE/Interpolators.h
index f118d71..c098c97 100644
--- a/BBGE/Interpolators.h
+++ b/BBGE/Interpolators.h
@@ -4,6 +4,15 @@
 #include <algorithm> // std::pair
 #include <vector>
 #include "Vector.h"
+#include "tbsp.hh"
+#include "DataStructures.h"
+
+enum SplineType
+{
+	INTERPOLATOR_BSPLINE,
+	INTERPOLATOR_COSINE,
+	INTERPOLATOR_BSPLINE_EXT
+};
 
 class CosineInterpolator
 {
@@ -36,18 +45,20 @@ public:
     inline unsigned degX() const { return _degx; }
     inline unsigned degY() const { return _degy; }
 
+    inline const float *getKnotsX() const { return &knotsX[0]; }
+    inline const float *getKnotsY() const { return &knotsY[0]; }
+
 private:
+
     size_t _cpx, _cpy; // # of control points
     unsigned _degx, _degy;
     float _tmin, _tmax;
     std::vector<float> knotsX, knotsY;
 };
 
-
 class BSpline2DWithPoints : public BSpline2D
 {
 public:
-
     void resize(size_t cx, size_t cy, unsigned degx, unsigned degy);
     void recalc(Vector *dst, size_t xres, size_t yres);
 
@@ -61,4 +72,31 @@ public:
     }
 };
 
+class BSpline2DControlPointGenerator
+{
+public:
+    bool resize(size_t cx, size_t cy);
+
+    bool refresh(const float *knotsx, const float *knotsy, unsigned degx, unsigned degy);
+
+    Vector *generateControlPoints(const Vector *points2d);
+
+private:
+    Array2d<Vector> cp2d;
+    struct
+    {
+        tbsp::Interpolator<float> x, y;
+    } interp;
+    std::vector<float> floats;
+    std::vector<Vector> vectmp;
+};
+
+class BSpline2DControlPointGeneratorWithPoints : public BSpline2DControlPointGenerator
+{
+public:
+    bool resize(size_t cx, size_t cy);
+    Vector *generateControlPoints();
+    std::vector<Vector> designpoints;
+};
+
 #endif
diff --git a/BBGE/SkeletalSprite.cpp b/BBGE/SkeletalSprite.cpp
index d6ada54..989a365 100644
--- a/BBGE/SkeletalSprite.cpp
+++ b/BBGE/SkeletalSprite.cpp
@@ -1660,7 +1660,9 @@ void SkeletalSprite::loadSkeletal(const std::string &fn)
 		XMLElement *animation = animations->FirstChildElement("Animation");
 		while(animation)
 		{
-			Animation newAnimation;
+			this->animations.push_back(Animation());
+			Animation& newAnimation = this->animations.back();
+
 			newAnimation.name = animation->Attribute("name");
 			if(animation->Attribute("resetOnEnd"))
 				newAnimation.resetOnEnd = animation->BoolAttribute("resetOnEnd");
@@ -1786,8 +1788,15 @@ void SkeletalSprite::loadSkeletal(const std::string &fn)
 			}
 
 			// <Interpolator bone="name or idx" type="TYPE config and params" data="controlpoints; aded by editor" />
+			size_t numInterp = 0;
 			XMLElement *interp = animation->FirstChildElement("Interpolator");
 			for( ; interp; interp = interp->NextSiblingElement("Interpolator"))
+				++numInterp;
+
+			newAnimation.interpolators.resize(numInterp);
+
+			interp = animation->FirstChildElement("Interpolator");
+			for(numInterp = 0 ; interp; interp = interp->NextSiblingElement("Interpolator"), ++numInterp)
 			{
 				Bone *bi = NULL;
 				const char *sbone = interp->Attribute("bone");
@@ -1817,17 +1826,16 @@ void SkeletalSprite::loadSkeletal(const std::string &fn)
 					continue;
 				}
 
-				SplineType spline = SPLINE_BSPLINE;
+				SplineType spline = INTERPOLATOR_BSPLINE;
 				unsigned cx = 3, cy = 3, degx = 3, degy = 3;
 				if(const char *stype = interp->Attribute("type"))
 				{
 					SimpleIStringStream is(stype, SimpleIStringStream::REUSE);
 					std::string ty;
 					is >> ty;
-					BoneGridInterpolator bgip;
 					if(ty == "bspline")
 					{
-						spline = SPLINE_BSPLINE;
+						spline = INTERPOLATOR_BSPLINE;
 						if(!(is >> cx >> cy >> degx >> degy))
 						{
 							if(!degx)
@@ -1850,10 +1858,8 @@ void SkeletalSprite::loadSkeletal(const std::string &fn)
 				grid->gridType = GRID_INTERP;
 				// bone grid should have been created via <Bone grid=... /> earlier
 
-				const char *idata = interp->Attribute("data");
-				newAnimation.interpolators.push_back(BoneGridInterpolator());
-				BoneGridInterpolator& bgip = newAnimation.interpolators.back();
-				//bgip.type = spline;
+
+				BoneGridInterpolator& bgip = newAnimation.interpolators[numInterp];
 				bgip.idx = bi->boneIdx;
 				bgip.storeBoneByIdx = boneByIdx;
 
@@ -1864,38 +1870,40 @@ void SkeletalSprite::loadSkeletal(const std::string &fn)
 				const size_t numcp = size_t(cx) * size_t(cy);
 				const size_t numgridp = grid->linearsize();
 
-				// data format: "W H [x y x y ... (W*H times)] W H x y x y ..."
-				//               ^- start of 1st keyframe  ^- 2nd keyframe
-				SimpleIStringStream is(idata ? idata : "",  SimpleIStringStream::REUSE);
-
-				// fixup keyframes and recalc spline points
-				for(size_t k = 0; k < newAnimation.keyframes.size(); ++k)
+				if(const char *idata = interp->Attribute("data"))
 				{
-					SkeletalKeyframe& kf = newAnimation.keyframes[k];
-					BoneKeyframe *bk = kf.getBoneKeyframe(bgip.idx);
+					// data format: "W H [x y x y ... (W*H times)] W H x y x y ..."
+					//               ^- start of 1st keyframe  ^- 2nd keyframe
+					SimpleIStringStream is(idata,  SimpleIStringStream::REUSE);
 
-					bk->controlpoints.resize(numcp);
-					bgip.bsp.reset(&bk->controlpoints[0]);
+					// fixup keyframes and recalc spline points
+					for(size_t k = 0; k < newAnimation.keyframes.size(); ++k)
+					{
+						SkeletalKeyframe& kf = newAnimation.keyframes[k];
+						BoneKeyframe *bk = kf.getBoneKeyframe(bgip.idx);
 
-					unsigned w = 0, h = 0;
-					Vector cp;
-					cp.z = 1; // we want all grid points at full alpha
+						bk->controlpoints.resize(numcp);
+						bgip.bsp.reset(&bk->controlpoints[0]);
 
-					if((is >> w >> h))
-						for(unsigned y = 0; y < h; ++y)
-							for(unsigned x = 0; x < w; ++x)
-								if((is >> cp.x >> cp.y))
-									if(x < cx && y < cy)
-										bk->controlpoints[y*size_t(cx) + x] = cp;
+						unsigned w = 0, h = 0;
+						Vector cp;
+						cp.z = 1; // we want all grid points at full alpha
 
-					bk->grid.resize(numgridp);
-					bgip.updateGridOnly(*bk, bi);
+						if((is >> w >> h))
+							for(unsigned y = 0; y < h; ++y)
+								for(unsigned x = 0; x < w; ++x)
+									if((is >> cp.x >> cp.y))
+										if(x < cx && y < cy)
+											bk->controlpoints[y*size_t(cx) + x] = cp;
+
+						bk->grid.resize(numgridp);
+						bgip.updateGridOnly(*bk, bi);
+					}
 				}
 				// ---- end bspline -----
 			}
 
 			animation = animation->NextSiblingElement("Animation");
-			this->animations.push_back(newAnimation);
 		}
 	}
 }
diff --git a/BBGE/SplineGrid.cpp b/BBGE/SplineGrid.cpp
index ccbed40..1de9cd2 100644
--- a/BBGE/SplineGrid.cpp
+++ b/BBGE/SplineGrid.cpp
@@ -1,7 +1,12 @@
 #include "SplineGrid.h"
+
+#include <assert.h>
+
 #include "RenderBase.h"
 #include "Core.h"
 #include "RenderGrid.h"
+#include "SkeletalSprite.h"
+
 
 SplineGridCtrlPoint *SplineGridCtrlPoint::movingPoint;
 
@@ -74,11 +79,12 @@ void SplineGridCtrlPoint::onUpdate(float dt)
 }
 
 SplineGrid::SplineGrid()
-    : wasModified(false), deg(0), pointscale(1)
+    : wasModified(false), deg(0), pointscale(1), _assistMode(true)
 {
     setWidthHeight(128, 128);
     renderQuad = true;
     renderBorder = true;
+    renderBorderColor = Vector(0.5f, 0.5f, 0.5f);
 }
 
 SplineGrid::~SplineGrid()
@@ -87,6 +93,9 @@ SplineGrid::~SplineGrid()
 
 DynamicRenderGrid *SplineGrid::resize(size_t w, size_t h, size_t xres, size_t yres, unsigned degx, unsigned degy)
 {
+    if(!cpgen.resize(w, h))
+        return NULL;
+
     size_t oldcpx = bsp.ctrlX();
     size_t oldcpy = bsp.ctrlY();
 
@@ -126,6 +135,9 @@ DynamicRenderGrid *SplineGrid::resize(size_t w, size_t h, size_t xres, size_t yr
                 ref = createControlPoint(x, y);
         }
 
+    if(!cpgen.refresh(bsp.getKnotsX(), bsp.getKnotsY(), bsp.degX(), bsp.degY()))
+        return NULL;
+
     recalc();
 
     return ret;
@@ -133,7 +145,17 @@ DynamicRenderGrid *SplineGrid::resize(size_t w, size_t h, size_t xres, size_t yr
 
 void SplineGrid::recalc()
 {
-    exportControlPoints(&bsp.controlpoints[0]);
+    if(_assistMode)
+    {
+        exportGridPoints(&cpgen.designpoints[0]);
+        _generateControlPointsFromDesignPoints();
+    }
+    else
+    {
+        exportGridPoints(&bsp.controlpoints[0]);
+        bsp.recalc(&cpgen.designpoints[0], bsp.ctrlX(), bsp.ctrlY());
+    }
+
     if(grid)
     {
         bsp.recalc(grid->dataRW(), grid->width(), grid->height());
@@ -141,24 +163,67 @@ void SplineGrid::recalc()
     }
 }
 
-void SplineGrid::exportControlPoints(Vector* controlpoints)
+void SplineGrid::exportGridPoints(Vector* pdst) const
 {
     for(size_t i = 0; i < ctrlp.size(); ++i)
-        controlpoints[i] = ctrlp[i]->getSplinePosition();
+        pdst[i] = ctrlp[i]->getSplinePosition();
 }
 
-void SplineGrid::importControlPoints(const Vector* controlpoints)
+void SplineGrid::importGridPoints(const Vector* psrc)
 {
     for(size_t i = 0; i < ctrlp.size(); ++i)
-        ctrlp[i]->setSplinePosition(controlpoints[i]);
+        ctrlp[i]->setSplinePosition(psrc[i]);
+}
+
+void SplineGrid::importKeyframe(const BoneKeyframe* bk)
+{
+    const size_t numcp = bsp.ctrlX() * bsp.ctrlY();
+    assert(bk->controlpoints.size() == numcp);
+
+    bsp.controlpoints = bk->controlpoints;
+
+    if(_assistMode)
+    {
+        // given control points, generate spline points (which are later caculated back into control points)
+        bsp.recalc(&cpgen.designpoints[0], bsp.ctrlX(), bsp.ctrlY());
+        importGridPoints(&cpgen.designpoints[0]);
+    }
+    else
+        importGridPoints(&bk->controlpoints[0]);
+
     recalc();
 }
 
+void SplineGrid::exportKeyframe(BoneKeyframe* bk) const
+{
+    const size_t numcp = bsp.ctrlX() * bsp.ctrlY();
+    assert(bk->controlpoints.size() == numcp);
+
+    bk->controlpoints = bsp.controlpoints;
+}
 
 void SplineGrid::resetControlPoints()
 {
     bsp.reset();
-    importControlPoints(&bsp.controlpoints[0]);
+
+    importGridPoints(&bsp.controlpoints[0]);
+
+    // This pushes the bspline controlpoints outwards so that all spline points line up as one would expect.
+    // If this weren't done, the tile's texture would be pulled inwards (more with increasing dimension);
+    // as if the tile was a piece of plastic foil that's seen too much heat.
+    //if(_assistMode) // ALWAYS DO THIS!!
+    {
+        cpgen.designpoints = bsp.controlpoints;
+        _generateControlPointsFromDesignPoints();
+    }
+
+    recalc();
+}
+
+void SplineGrid::_generateControlPointsFromDesignPoints()
+{
+    const Vector *cp = cpgen.generateControlPoints();
+    memcpy(&bsp.controlpoints[0], cp, bsp.controlpoints.size() * sizeof(*cp));
 }
 
 SplineGridCtrlPoint* SplineGrid::createControlPoint(size_t x, size_t y)
@@ -190,7 +255,10 @@ void SplineGrid::onRender(const RenderState& rs) const
     const Vector wh2(width * 0.5f, height * 0.5f);
 
     glLineWidth(2);
-    glColor4f(0.0f, 0.3f, 1.0f, 0.3f);
+    if(_assistMode)
+        glColor4f(0.0f, 0.3f, 1.0f, 0.4f);
+    else
+        glColor4f(0.0f, 0.0f, 0.0f, 0.4f);
 
     const size_t cpx = bsp.ctrlX();
     const size_t cpy = bsp.ctrlY();
@@ -219,6 +287,45 @@ void SplineGrid::onRender(const RenderState& rs) const
         }
         glEnd();
     }
+
+    const Vector *psrc = _assistMode
+        ? &bsp.controlpoints[0]
+        : &cpgen.designpoints[0];
+
+    if(RenderObject::renderCollisionShape)
+    {
+        glLineWidth(1);
+        glColor4f(1.0f, 0.3f, 0.3f, 0.7f);
+        glPushMatrix();
+        glScalef(width, height, 1);
+
+        // X axis
+        for(size_t y = 0; y < cpy; ++y)
+        {
+            glBegin(GL_LINE_STRIP);
+            const Vector *row = &psrc[y * cpx];
+            for(size_t x = 0; x < cpx; ++x)
+            {
+                const Vector p = row[x];
+                glVertex2f(p.x, p.y);
+            }
+            glEnd();
+        }
+
+        // Y axis
+        for(size_t x = 0; x < cpx; ++x)
+        {
+            glBegin(GL_LINE_STRIP);
+            for(size_t y = 0; y < cpy; ++y)
+            {
+                const Vector p = psrc[y * cpx + x];
+                glVertex2f(p.x, p.y);
+            }
+            glEnd();
+        }
+
+        glPopMatrix();
+    }
 }
 
 void SplineGrid::setPointScale(const float scale)
@@ -230,3 +337,17 @@ void SplineGrid::setPointScale(const float scale)
         ctrlp[i]->scale.y = scale;
     }
 }
+
+void SplineGrid::setAssist(bool on)
+{
+    if(on == _assistMode)
+        return;
+
+    if(on)
+        importGridPoints(&cpgen.designpoints[0]);
+    else
+        importGridPoints(&bsp.controlpoints[0]);
+
+    _assistMode = on;
+    recalc();
+}
diff --git a/BBGE/SplineGrid.h b/BBGE/SplineGrid.h
index 33106da..eb7045a 100644
--- a/BBGE/SplineGrid.h
+++ b/BBGE/SplineGrid.h
@@ -7,11 +7,8 @@
 #include "Quad.h"
 #include "Interpolators.h"
 
-enum SplineType
-{
-	SPLINE_BSPLINE,
-	SPLINE_COSINE,
-};
+
+class BoneKeyframe;
 
 class SplineGridCtrlPoint : public Quad
 {
@@ -36,13 +33,22 @@ public:
 	// # of control points on each axis
 	DynamicRenderGrid *resize(size_t w, size_t h, size_t xres, size_t yres, unsigned degx, unsigned degy);
 	void recalc();
-	void exportControlPoints(Vector *controlpoints);
-	void importControlPoints(const Vector *controlpoints);
+
+	// Export/import grid points; depending on the mode these either correspond directly to control points
+	// or to spline points from which the control points need to be calculated first (using cpgen)
+	void exportGridPoints(Vector *pdst) const;
+	void importGridPoints(const Vector *psrc);
+
+	void importKeyframe(const BoneKeyframe *bk);
+	void exportKeyframe(BoneKeyframe *bk) const;
+
 	void resetControlPoints();
 
 	void setPointScale(const float scale);
 	float getPointScale() const { return pointscale; }
 
+	void setAssist(bool on);
+
 
 	virtual void onRender(const RenderState& rs) const OVERRIDE;
 	virtual void onUpdate(float dt) OVERRIDE;
@@ -53,6 +59,7 @@ public:
 	bool wasModified; // to be checked/reset by external code
 
 private:
+	void _generateControlPointsFromDesignPoints();
 
 	SplineGridCtrlPoint *createControlPoint(size_t x, size_t y);
 
@@ -60,6 +67,9 @@ private:
 	unsigned deg;
 	BSpline2DWithPoints bsp;
 	float pointscale;
+
+	BSpline2DControlPointGeneratorWithPoints cpgen;
+	bool _assistMode;
 };
 
 
diff --git a/ExternalLibs/tbsp.hh b/ExternalLibs/tbsp.hh
index bacec09..a94f9e0 100644
--- a/ExternalLibs/tbsp.hh
+++ b/ExternalLibs/tbsp.hh
@@ -1,65 +1,125 @@
-/* Tiny B-spline evaluation library
+/* Tiny B-spline evaluation and interpolation library
 
 License:
 Public domain, WTFPL, CC0 or your favorite permissive license; whatever is available in your country.
 
-Dependencies:
-- Requires C++98 without libc
-
 Origin:
 https://github.com/fgenesis/tinypile
 
+This library is split into two parts:
+- (1) Bspline evaluation (given a set of control points, generate points along the spline)
+- (2) Bspline interpolation (given some points, generate control points so that the resulting spline
+      goes through these points. Needs part (1) to function.)
 
---- Example usage: ---
+Requirements:
+- Part (1) has zero dependencies (not even libc)
+- Part (2) requires sqrt() from the libc. Change TBSP_SQRT() to use your own.
+
+Design notes:
+- C++98 compatible. Stand-alone.
+- No memory allocation; all memory is caller-controlled and caller-owned.
+  In cases where this is needed, the caller needs to pass appropriately-sized working memory.
+
+
+This is a template library and your types must fulfil certain criteria:
+
+- Scalar (T): Same semantics as float or double. Should be POD. Best use float.
+
+- Point (P): Interpolated type. Must support the following operators:
+     Point operator+(const Point& o) const    // Element addition
+     Point operator-(const Point& o) const    // Element subtraction
+     Point& operator+=(const Point& o)        // Add to self
+     Point& operator-=(const Point& o)        // Subtract from self
+     Point operator*(Scalar m) const          // Scalar multiplication
+     Point& operator=(const Point& o)         // Assignment
+    No constructors are required and the code does not need the type to be zero-inited.
+
+
+--- Example usage, Bspline evaluation: ---
 
 enum { DEGREE = 3 }; // Cubic
-// You can interpolate anything as long as it supports element addition and scalar multiplication
-struct Point { ... operator+(Point) and operator*(float) overloaded ... };
-const Point ps[N] = {...}; // Control points for the spline
+const Point cp[NCP] = {...}; // Control points for the spline
 
-float knots[tbsp__getNumKnots(N, DEGREE)]; // knot vector; used for evaluating the spline
+float knots[tbsp__getNumKnots(NCP, DEGREE)]; // knot vector; used for evaluating the spline
 Point tmp[DEGREE]; // Temporary working memory must be provided by the caller.
                    // This is just a tiny array. Must have as many elements as the degree of the spline.
 
 // This must be done once for each B-spline; the spline is then defined by the knot vector.
 // In particular, this inits a knot vector with end points [L..R],
 // ie. the spline will interpolate values for t = [L..R].
-// (You can use any boundary values, eg. [-4..+5], but [0..1] is the most common)
-tbsp::fillKnotVector(knots, N, DEGREE, L, R);
+// (You can use any boundary values L < R, eg. [-4..+5], but [0..1] is the most common)
+// Note that this depends only on the number of the control points, but not their values.
+// This means you only need to compute this when NCP changes!
+tbsp::fillKnotVector(knots, NCP, DEGREE, L, R);
 
 // Evaluate the spline at point t
-// Returns ps[0] if t <= L; ps[N-1] if t >= R; otherwise an interpolated point
-Point p = tbsp::evalOne(tmp, knots, ps, N, DEGREE, t);
+// Returns cp[0] if t <= L; cp[NCP-1] if t >= R; otherwise an interpolated point
+Point p = tbsp::evalOne(tmp, knots, cp, NCP, DEGREE, t);
 
-// Evaluate A points between t=0.2 .. t=0.5, equidistantly spaced, and write to a[].
-// (If you have multiple points to evaluate, this is faster than multiple evalOne() calls)
-Point a[A];
-tbsp::evalRange(out, A, tmp, knots, ps, N, DEGREE, 0.2f, 0.5f);
+// Evaluate NP points between t=0.2 .. t=0.5, equidistantly spaced, and write to p[].
+// (If you have multiple points to evaluate, this is much faster than multiple evalOne() calls)
+Point p[NP];
+tbsp::evalRange(p, NP, tmp, knots, cp, NCP, DEGREE, 0.2f, 0.5f);
 */
 
 #pragma once
 
 #include <stddef.h> // size_t
 
+// ---- Compile-time config ------
 
-#ifndef TBSP_ASSERT
-#  include <assert.h>
-#  define TBSP_ASSERT(x) assert(x)
+
+#ifndef TBSP_SQRT // Needed for part (2) only
+#  include <math.h>
+#  define TBSP_SQRT(x) sqrt(x)
 #endif
 
-// ---- B-Spline eval part begin ----
+#if !defined(NDEBUG) || defined(_DEBUG) || defined(DEBUG)
+#  ifndef TBSP_ASSERT
+#    include <assert.h>
+#    define TBSP_ASSERT(x) assert(x)
+#  endif
+#endif
+
+
+
+// ---- Generic defines and stuff we need ----
+
+
+// Should be constexpr, but we want to stay C++98-compatible
+#define tbsp__getNumKnots(numControlPoints, degree) ((numControlPoints) + (degree) + 1)
+
+#ifndef TBSP_ASSERT
+#  define TBSP_ASSERT(x)
+#endif
+
+#ifdef _MSC_VER
+#  define TBSP_RESTRICT __restrict
+#else
+#  define TBSP_RESTRICT restrict
+#endif
+
+#ifndef TBSP_HAS_CPP11
+#  if (__cplusplus > 201103L) || (defined(_MSC_VER) && ((_MSC_VER+0) >= 1900))
+#    define TBSP_HAS_CPP11 1
+#  else
+#    define TBSP_HAS_CPP11 0
+#  endif
+#endif
+
+// -----------------------------------------------------------------------------------
+// ---- Part (1) begin: B-Spline eval ----
+// Given some control points, calculate points along the spline.
+// -----------------------------------------------------------------------------------
+
 
 namespace tbsp {
 
-// These should be constexpr, but we want to stay C++98-compatible
-#define tbsp__getNumKnots(points, degree) ((points) + (degree) + 1)
-#define tbsp__getKnotVectorAllocSize(K, points, degree) (sizeof(K) * tbsp__getNumKnots((points), (degree)))
-
 namespace detail {
 
-// returns index of first element strictly less than t
-template<typename K>
-static size_t findKnotIndexOffs(K val, const K *p, size_t n)
+// returns index of first element strictly less than val
+template<typename T>
+static size_t findKnotIndexOffs(T val, const T *p, size_t n)
 {
     // Binary search to find leftmost element that is < val
     size_t L = 0;
@@ -73,17 +133,21 @@ static size_t findKnotIndexOffs(K val, const K *p, size_t n)
         else
             R = m;
     }
+    // FIXME: can we just return m at this point?
+    if(L && !(p[L] < val))
+        --L;
+    TBSP_ASSERT(!L || p[L] < val);
     return L;
 }
 
-template<typename K>
-static inline size_t findKnotIndex(K val, const K *knots, size_t n, size_t degree)
+template<typename T>
+static inline size_t findKnotIndex(T val, const T *knots, size_t numknots, size_t degree)
 {
-    TBSP_ASSERT(n > degree);
-    TBSP_ASSERT(val < knots[n - degree - 1]); // beyond right end? should have been caught by caller
+    TBSP_ASSERT(numknots > degree);
+    TBSP_ASSERT(val < knots[numknots - degree - 1]); // beyond right end? should have been caught by caller
 
     // skip endpoints
-    return degree + findKnotIndexOffs(val, knots + degree, n - degree);
+    return degree + findKnotIndexOffs(val, knots + degree, numknots - (degree * 2u));
 }
 
 template<typename K>
@@ -94,10 +158,10 @@ static void genKnotsUniform(K *knots, size_t nn, K mink, K maxk)
         knots[i] = mink + K(i+1) * m;
 }
 
-template<typename K, typename T>
-static T deBoor(T *work, const T *src, const K *knots, const size_t r, const size_t k, const K t, size_t inputStride)
+template<typename T, typename P>
+static P deBoor(P * TBSP_RESTRICT work, const P * TBSP_RESTRICT src, const T * TBSP_RESTRICT knots, const size_t r, const size_t k, const T t, size_t inputStride)
 {
-    T last = src[0]; // init so that it works correctly even with degree == 0
+    P last = src[0]; // init so that it works correctly even with degree == 0
     for(size_t worksize = k; worksize > 1; --worksize)
     {
         const size_t j = k - worksize + 1; // iteration number, starting with 1, going up to k
@@ -105,12 +169,12 @@ static T deBoor(T *work, const T *src, const K *knots, const size_t r, const siz
         for(size_t w = 0, wr = 0; w < worksize - 1; ++w, wr += inputStride)
         {
             const size_t i = w + tmp;
-            const K ki = knots[i];
+            const T ki = knots[i];
             TBSP_ASSERT(ki <= t);
-            const K div = knots[i+k-j] - ki;
+            const T div = knots[i+k-j] - ki;
             TBSP_ASSERT(div > 0);
-            const K a = (t - ki) / div;
-            const K a1 = K(1) - a;
+            const T a = (t - ki) / div;
+            const T a1 = T(1) - a;
             work[w] = last = (src[wr] * a1) + (src[wr + inputStride] * a); // lerp
         }
         src = work; // done writing the initial data to work, now use that as input for further iterations
@@ -122,11 +186,13 @@ static T deBoor(T *work, const T *src, const K *knots, const size_t r, const siz
 } // end namespace detail
 //--------------------------------------
 
-template<typename K>
-static size_t fillKnotVector(K *knots, size_t points, size_t degree, K mink, K maxk)
+template<typename T>
+static size_t fillKnotVector(T *knots, size_t numcp, size_t degree, T mink, T maxk)
 {
-    const size_t n = points - 1;
-    if(n < degree) // lower degree if not enough points
+    TBSP_ASSERT(mink < maxk);
+
+    const size_t n = numcp - 1;
+    if(n < degree) // lower degree if not enough control points
         degree = n;
     TBSP_ASSERT(n >= degree);
 
@@ -149,17 +215,17 @@ static size_t fillKnotVector(K *knots, size_t points, size_t degree, K mink, K m
 }
 
 // evaluate single point at t
-template<typename K, typename T>
-static T evalOne(T *work, const K *knots, const T *points, size_t numpoints, size_t degree, K t)
+template<typename T, typename P>
+static P evalOne(P * TBSP_RESTRICT work, const T * TBSP_RESTRICT knots, const P * TBSP_RESTRICT controlpoints, size_t numcp, size_t degree, T t, size_t inputStride = 1)
 {
     if(t < knots[0])
-        return points[0]; // left out-of-bounds
+        return controlpoints[0]; // left out-of-bounds
 
-    if(numpoints - 1 < degree)
-        degree = numpoints - 1;
+    if(numcp - 1 < degree)
+        degree = numcp - 1;
 
-    const size_t numknots = tbsp__getNumKnots(numpoints, degree);
-    const K maxknot = knots[numknots - 1];
+    const size_t numknots = tbsp__getNumKnots(numcp, degree);
+    const T maxknot = knots[numknots - 1];
     if(t < maxknot)
     {
         const size_t r = detail::findKnotIndex(t, knots, numknots, degree);
@@ -167,29 +233,29 @@ static T evalOne(T *work, const K *knots, const T *points, size_t numpoints, siz
         const size_t k = degree + 1;
         TBSP_ASSERT(r + k < numknots); // check that the copy below stays in bounds
 
-        const T* const src = &points[r - degree];
-        return detail::deBoor(work, src, knots, r, k, t);
+        const P* const src = &controlpoints[r - degree];
+        return detail::deBoor(work, src, knots, r, k, t, inputStride);
     }
 
-    return points[numpoints - 1]; // right out-of-bounds
+    return controlpoints[numcp - 1]; // right out-of-bounds
 }
 
 // evaluate numdst points in range [tmin..tmax], equally spaced
-template<typename K, typename T>
-static void evalRange(T *dst, size_t numdst, T *work, const K *knots, const T *points, size_t numpoints, size_t degree, K tmin, K tmax, size_t inputStride = 1, size_t outputStride = 1)
+template<typename T, typename P>
+static void evalRange(P * TBSP_RESTRICT dst, size_t numdst, P * TBSP_RESTRICT work, const T * TBSP_RESTRICT knots, const P * TBSP_RESTRICT controlpoints, size_t numcp, size_t degree, T tmin, T tmax, size_t inputStride = 1, size_t outputStride = 1)
 {
     TBSP_ASSERT(tmin <= tmax);
-    if(numpoints - 1 < degree)
-        degree = numpoints - 1;
+    if(numcp - 1 < degree)
+        degree = numcp - 1;
 
-    const size_t numknots = tbsp__getNumKnots(numpoints, degree);
+    const size_t numknots = tbsp__getNumKnots(numcp, degree);
     size_t r = detail::findKnotIndex(tmin, knots, numknots, degree);
     TBSP_ASSERT(r >= degree);
     const size_t k = degree + 1;
     TBSP_ASSERT(r + k < numknots); // check that the copy below stays in bounds
 
-    const K step = (tmax - tmin) / K(numdst - 1);
-    K t = tmin;
+    const T step = (tmax - tmin) / T(numdst - 1);
+    T t = tmin;
     const size_t maxidx = numknots - k;
 
 
@@ -198,18 +264,18 @@ static void evalRange(T *dst, size_t numdst, T *work, const K *knots, const T *p
     // left out-of-bounds
     for( ; i < numdst && t < knots[0]; ++i, t += step)
     {
-        *dst = points[0];
+        *dst = controlpoints[0];
         dst += outputStride;
     }
 
     // actually interpolated points
-    const K maxknot = knots[numknots - 1];
+    const T maxknot = knots[numknots - 1];
     for( ; i < numdst && t < maxknot; ++i, t += step)
     {
         while(r < maxidx && knots[r+1] < t) // find new index; don't need to do binary search again
             ++r;
 
-        const T* const src = &points[(r - degree) * inputStride];
+        const P * const src = &controlpoints[(r - degree) * inputStride];
         *dst = detail::deBoor(work, src, knots, r, k, t, inputStride);
         dst += outputStride;
     }
@@ -217,14 +283,712 @@ static void evalRange(T *dst, size_t numdst, T *work, const K *knots, const T *p
     // right out-of-bounds
     if(i < numdst)
     {
-        T last =  points[(numpoints - 1) * inputStride];
-        for( ; i < numdst; ++i)
+        const P p = controlpoints[(numcp - 1) * inputStride];
+        do
         {
-            *dst = last;
+            *dst = p;
             dst += outputStride;
+            ++i;
+        }
+        while(i < numdst);
+    }
+}
+
+// -----------------------------------------------------------------------------------
+// --- Part (2) begin: B-spline interpolation ----
+// Given some points that should lie on a spline, calculate its control points
+// -----------------------------------------------------------------------------------
+
+namespace detail {
+
+struct Size2d
+{
+    size_t w, h;
+
+    inline bool operator==(const Size2d& o) const { return w == o.w && h == o.h; }
+};
+
+// Wrap any pointer to access it like a vector (with proper bounds checking)
+template<typename T>
+class VecAcc
+{
+public:
+    typedef T value_type;
+    VecAcc(T *p, const size_t n)
+        : p(p), n(n) {}
+
+    inline T& operator[](size_t i)
+    {
+        TBSP_ASSERT(i < n);
+        return p[i];
+    }
+    inline const T& operator[](size_t i) const
+    {
+        TBSP_ASSERT(i < n);
+        return p[i];
+    }
+    template<typename V>
+    VecAcc& operator=(const V& o)
+    {
+        TBSP_ASSERT(n == o.size());
+        for(size_t i = 0; i < n; ++i)
+            p[i] = o[i];
+        return *this;
+    }
+    VecAcc& operator=(const VecAcc<T>& o) // Required for C++11
+    {
+        TBSP_ASSERT(n == o.size());
+        for(size_t i = 0; i < n; ++i)
+            p[i] = o[i];
+        return *this;
+    }
+    template<typename V>
+    VecAcc& operator+=(const V& o)
+    {
+        TBSP_ASSERT(n == o.size());
+        for(size_t i = 0; i < n; ++i)
+            p[i] += o[i];
+        return *this;
+    }
+    template<typename V>
+    VecAcc& operator-=(const V& o)
+    {
+        TBSP_ASSERT(n == o.size());
+        for(size_t i = 0; i < n; ++i)
+            p[i] -= o[i];
+        return *this;
+    }
+
+    T dot(VecAcc<T>& o) const // vector dot product
+    {
+        T s = 0;
+        const size_t n = this->n;
+        tbsp__ASSERT(n == o.n);
+        for(size_t i = 0; i < n; ++i)
+            s += a.p[i] * b.p[i];
+        return s;
+    }
+
+    inline size_t size() const { return n; }
+    inline       T *data()       { return p; }
+    inline const T *data() const { return p; }
+
+    T *p;
+    size_t n;
+};
+
+// Wraps any pointer to access it like a row matrix.
+// Intentionally a dumb POD struct, do NOT put ctors!
+template<typename T>
+struct MatrixAcc
+{
+    typedef T value_type;
+    typedef VecAcc<T> RowAcc;
+
+    inline T& operator()(size_t x, size_t y)
+    {
+        TBSP_ASSERT(x < size.w);
+        TBSP_ASSERT(y < size.h);
+        return p[y * size.w + x];
+    }
+    inline const T& operator()(size_t x, size_t y) const
+    {
+        TBSP_ASSERT(x < size.w);
+        TBSP_ASSERT(y < size.h);
+        return p[y * size.w + x];
+    }
+
+    inline T *rowPtr(size_t y) const
+    {
+        return p + (y * size.w);
+    }
+
+    // w,h of (this * o)
+    inline Size2d multSize(const MatrixAcc<T>& o) const
+    {
+        // in math notation: (n x m) * (m x p) -> (n x p)
+        // and because math is backwards:
+        // (h x w) * (H x W) -> (h x W)
+        TBSP_ASSERT(size.w == o.size.h);
+        Size2d wh { o.size.w, size.h };
+        return wh;
+    }
+
+    inline RowAcc row(size_t y) const
+    {
+        TBSP_ASSERT(y < size.h);
+        return RowAcc(p + (y * size.w), size.w);
+    }
+
+    T *p;
+    Size2d size;
+};
+
+// Cut away the borders from A, so that the new matrix A' size is (A.size.w - 2, A.size.h - 2):
+//     ( xxxxx )  (where x means cut and any other letter means keep)
+//     ( xabcx )        ( abc )
+// A = ( xdefx ),  A' = ( def )
+//     ( xxxxx )
+// then compute T(A') x A', ie.
+//     ( ad )   ( abc )
+// R = ( be ) X ( def )
+//     ( cf )
+// R must be already the correct size, ie. (A.size.w - 2, A.size.w - 2)!
+template<typename T>
+static void matMultCenterCutTransposeWithSelf(MatrixAcc<T>& TBSP_RESTRICT R, const MatrixAcc<T>& TBSP_RESTRICT A)
+{
+    TBSP_ASSERT(A.size.w > 2 && A.size.h > 2);
+
+    const size_t w = A.size.w - 2; // also the final size of R: (w x w);
+    const size_t h = A.size.h - 2; // when iterating over w/h it stops before the last element in that row/col
+    TBSP_ASSERT(R.size.w == w && R.size.h == w);
+
+    T *out = R.p;
+    for (size_t y = 0; y < w; ++y)
+    {
+        for (size_t x = 0; x < w; ++x)
+        {
+            const T *row = A.p + y + 1; // skip first elem (+1)
+            const T *col = A.p + x + w; // skip first row (+w)
+            T temp = 0;
+            for (size_t k = 0; k < h; ++k, row += w, col += w)
+                temp += *row * *col;
+            *out++ = temp;
         }
     }
 }
 
 
+// Linear system solver via Cholesky decomposition
+template<typename T>
+struct Cholesky
+{
+    typedef T value_type;
+    typedef MatrixAcc<T> Mat;
+
+    // Initializes a solver in the given memory.
+    // On success, bumps ptr forward by the memory it needs;
+    // on failure, returns NULL.
+    // Memory needed: (sizeof(T) * ((A.size.w + 1) * A.size.h))
+    T *init(T *mem, Size2d size)
+    {
+        T * const pdiag = mem + (size.w * size.h);
+        T * const myend = pdiag + size.w;
+
+        L.p = mem;
+        L.size = size;
+        idiag = pdiag; // T[n]
+        return myend;
+    }
+
+    // Sets the matrix A used for solving later. The solver is preconditioned
+    // for that matrix so that in (A * x + b)  b is the only variable.
+    // May fail if A is not well formed.
+    bool refresh(const Mat& A)
+    {
+        TBSP_ASSERT(A.size == L.size);
+        TBSP_ASSERT(A.size.w >= A.size.h);
+
+        const size_t n = L.size.w;
+
+        // Fill the lower left triangle, storing the diagonal separately
+        for(size_t y = 0; y < n; ++y)
+        {
+            const typename Mat::RowAcc rrow = A.row(y);
+            typename Mat::RowAcc wrow = L.row(y);
+            for(size_t x = y; x < n; ++x)
+            {
+                T s = rrow[x];
+                for(size_t k = 0; k < y; ++k)
+                    s -= wrow[k] * L(k,x);
+                if(x != y)
+                    L(y,x) = s * idiag[y];
+                else if(s > 0)
+                    idiag[y] = T(1) / (wrow[y] = TBSP_SQRT(s));
+                else
+                {
+                    TBSP_ASSERT(0 && "Cholesky decomposition failed");
+                    return false;
+                }
+            }
+        }
+
+        // Fill the upper right triangle with zeros
+        const T zero = T(0);
+        for(size_t y = 0; y < n; ++y)
+        {
+            typename Mat::RowAcc row = L.row(y);
+            for(size_t x = y+1; x < n; ++x)
+                row[x] = zero;
+        }
+
+        return true;
+    }
+
+    // Solves x for A * x + b. Both b and x must have length n.
+    // Can solve in-place, ie. you may pass xv == bv.
+    template<typename P>
+    void solve(P * const xv, const P * const bv) const
+    {
+        const size_t n = L.size.w;
+
+        for(size_t y = 0; y < n; ++y)
+        {
+            const typename Mat::RowAcc Lrow = L.row(y);
+            P p = bv[y];
+            for(size_t x = 0; x < y; ++x)
+                p -= xv[x] * Lrow[x];
+            xv[y] = p * idiag[y];
+        }
+        for(size_t y = n; y--; )
+        {
+            P p = xv[y];
+            for(size_t x = y+1; x < n; ++x)
+                p -= xv[x] * L(y,x);
+            xv[y] = p * idiag[y];
+        }
+    }
+
+    Mat L; // lower left triangle matrix
+    T *idiag; // 1 / diag; length is L.size.w
+};
+
+// Linear system solver via LU decomposition
+// (Slower than Cholesky, but a bit more general ie. applicable to more cases)
+template<typename T>
+struct LUDecomp
+{
+    typedef T value_type;
+    typedef MatrixAcc<T> Mat;
+
+    // Initializes a solver in the given memory.
+    // On success, bumps ptr forward by the memory it needs;
+    // on failure, returns NULL.
+    // Memory needed: (sizeof(T) * ((A.size.w + 1) * A.size.h))
+    T *init(T *mem, Size2d size)
+    {
+        T * const pdiag = mem + (size.w * size.h);
+        T * const myend = pdiag + size.w;
+
+        TBSP_ASSERT(size.w == size.h);
+
+        LU.p = mem;
+        LU.size = size;
+        idiag = pdiag;
+        return myend;
+    }
+
+    // Sets the matrix A used for solving later. The solver is preconditioned
+    // for that matrix so that in (A * x + b)  b is the only variable.
+    void refresh(const Mat& A)
+    {
+        TBSP_ASSERT(A.size == LU.size);
+
+        const size_t n = LU.size.w;
+
+        // Note: This doesn't do pivoting.
+        for (size_t y = 0; y < n; ++y)
+        {
+            for (size_t x = y; x < n; ++x)
+            {
+                T e = A(x, y);
+                for (size_t k = 0; k < y; ++k)
+                    e -= LU(k, y) * LU(x, k);
+                LU(x, y) = e;
+            }
+            const T idiagval = T(1) / LU(y,y);
+            idiag[y] = idiagval;
+            for (size_t x = y + 1; x < n; ++x)
+            {
+                T e = A(y,x);
+                for (size_t k = 0; k < y; ++k)
+                    e -= LU(k, x) * LU(y, k);
+                LU(y, x) = idiagval * e;
+            }
+        }
+    }
+
+    // Solves x for A * x + b. Both b and x must have length n.
+    // Can solve in-place, ie. you may pass xv == bv.
+    template<typename P>
+    void solve(P * const xv, const P * const bv) const
+    {
+        const size_t n = LU.size.w;
+
+        for (size_t y = 0; y < n; ++y)
+        {
+            P p = bv[y];
+            const typename Mat::RowAcc LUrow = LU.row(y);
+            for (size_t x = 0; x < y; ++x)
+                p -= xv[x] * LUrow[x];
+            xv[y] = p;
+        }
+        for (size_t y = n; y--; )
+        {
+            P p = xv[y];
+            const typename Mat::RowAcc LUrow = LU.row(y);
+            for (size_t x = y + 1; x < n; ++x)
+                p -= xv[x] * LUrow[x];
+            xv[y] = p * idiag[y];
+        }
+    }
+
+    Mat LU; // combined both lower right & upper left triangles matrix
+    T *idiag; // 1 / diag; length is L.size.w
+};
+
+// The coeff vector for a given position u on the spline describes the influence of each control point
+// towards the final result, ie.:
+// resultPoint(u) = SUM(Nrow[i] * controlpoint[i]) for i in [0..Nrow),
+//  where Nrow[] are the coefficients for u.
+// And to keep the parametrization simple, u is computed from t=0..1
+template<typename T>
+void computeCoeffVector(T * TBSP_RESTRICT Nrow, size_t numcp, T t01, const T * TBSP_RESTRICT knots, size_t degree)
+{
+    for(size_t i = 0; i < numcp; ++i)
+        Nrow[i] = T(0);
+
+    const size_t n = numcp - 1;
+
+    // special cases
+    if(t01 <= T(0))
+    {
+        Nrow[0] = T(1);
+        return;
+    }
+    else if(t01 >= T(1))
+    {
+        Nrow[n] = T(1);
+        return;
+    }
+
+    const size_t numknots = tbsp__getNumKnots(numcp, degree);
+    const size_t m = numknots - 1;
+    const T mink = knots[0];
+    const T maxk = knots[m];
+
+    // Position on the knot vector aka transform t=0..1 into knot vector space
+    const T u = mink + t01 * (maxk - mink);
+
+    // find index k, so that u is in [knots[k], knots[k+1])
+    const size_t k = detail::findKnotIndex(u, knots, numknots, degree);
+    Nrow[k] = T(1);
+
+    // Coefficient computation
+    // See also: https://pages.mtu.edu/~shene/COURSES/cs3621/NOTES/spline/B-spline/bspline-curve-coef.html
+    for(size_t d = 1; d <= degree; ++d)
+    {
+        TBSP_ASSERT(d <= k);
+        const T q = (knots[k+1] - u) / (knots[k+1] - knots[k-d+1]);
+        Nrow[k-d] = q * Nrow[k-d+1];
+
+        for(size_t i = k-d+1; i < k; ++i)
+        {
+            const T a = (u - knots[i]) / (knots[i+d] - knots[i]);
+            const T b = (knots[i+d+1] - u) / (knots[i+d+1] - knots[i+1]);
+            Nrow[i] = a * Nrow[i] + b * Nrow[i+1];
+        }
+
+        Nrow[k] *= ((u - knots[k]) / (knots[k+d] - knots[k]));
+    }
+}
+
+template<typename T>
+T *allocateCoeffMatrix(MatrixAcc<T>& N, T *mem, size_t nump, size_t numcp)
+{
+    N.size.w = numcp;
+    N.size.h = nump;
+    N.p = mem;
+    return mem + (nump * numcp);
+}
+
+template<typename T>
+void computeCoeffMatrix(MatrixAcc<T>& N, const T *knots, size_t degree)
+{
+    const size_t numcp = N.size.w;
+    const size_t nump = N.size.h;
+
+    const T invsz = T(1) / T(nump - 1);
+
+    for(size_t i = 0; i < nump; ++i) // rows
+    {
+        // TODO: this is the parametrization. currently, this is equidistant. allow more.
+        const T t01 = float(i) * invsz; // position of point assuming uniform parametrization
+
+        typename MatrixAcc<T>::RowAcc row = N.row(i);
+        computeCoeffVector(&row[0], numcp, t01, knots, degree); // row 0 stores all coefficients for _t[0], etc
+    }
+}
+
+template<typename T>
+Size2d getLeastSquaresSize(const MatrixAcc<T>& N)
+{
+    Size2d sz;
+    sz.w = N.size.w - 2;
+    sz.h = N.size.w - 2; // (h = w) is intended, M is a square matrix!
+    return sz;
+}
+
+// Generate least-squares approximator for spline approximation
+template<typename T>
+MatrixAcc<T> generateLeastSquares(T *mem, const MatrixAcc<T>& N)
+{
+    MatrixAcc<T> M;
+    M.p = mem;
+    M.size == getLeastSquaresSize(N);
+
+    T * const pend = M.p + (M.size.w * M.size.h);
+
+    matMultCenterCutTransposeWithSelf(M, N);
+
+    return M;
+}
+
+} // end namespace detail
+// --------------------------------------------------
+
+// Calculate how many elements of type T are needed as storage for the interpolator
+// These should ideally be constexpr, but we want C++98 compatibility.
+
+// For Interpolator::init()
+#define tbsp__getInterpolatorStorageSize(numControlPoints, numPoints)         \
+    ( ((numControlPoints) == (numControlPoints))                              \
+        ? (((numControlPoints)+1) * (numPoints)) /* solver(N) */              \
+        : (                                                                   \
+            ((numControlPoints) * (numPoints)) /* N */                        \
+          + (((numControlPoints)-1) * ((numControlPoints)-2)) /* solver(M) */ \
+          )                                                                   \
+    )
+
+// For Interpolator::refresh()
+#define tbsp__getInterpolatorRefreshTempSize(numControlPoints, numPoints)        \
+    ( (numControlPoints) == (numControlPoints)                                \
+        ? ((numControlPoints) * (numPoints)) /* N */                          \
+        : (((numControlPoints)-2) * ((numControlPoints)-2)) /* M */           \
+    )
+
+// For Interpolator::generateControlPoints()
+#define tbsp_getInterpolatorWorkTempSize(numControlPoints, numPoints) \
+    ( ((numControlPoints) == (numPoints)) ? 0 : ((numControlPoints) - 2) )
+
+// --------------------------------------------------
+
+
+// One interpolator is precomputed for a specific number of input points and output control points
+template<typename T>
+struct Interpolator
+{
+    inline Interpolator() : numcp(0), nump(0) {}
+
+    // Inits an Interpolator in the passed mem[]. This memory must outlive the Interpolator
+    // and be at least tbsp__getInterpolatorStorageSize() elements.
+    // You may move or copy the memory elsewhere and then call init() again to update internal
+    // pointers to point to the new location.
+    // This does not access mem at all, it only stores pointers.
+    // When the number of points or control points change, the memory requirements change too;
+    // in this case you must re-init(), generate a new knot vector, and refresh().
+    T *init(T *mem, size_t nump, size_t numcp);
+
+    // When the knot vector or degree changes, call this to update the internal matrix solver.
+    // Must be called before generateControlPoints() can be used.
+    // tmp[] must have space for at least tbsp__getInterpolatorRefreshTempSize() elements
+    // and can be discarded after the call returns.
+    bool refresh(T * TBSP_RESTRICT const tmp, const T * TBSP_RESTRICT knots, unsigned degree);
+
+    // Given a set of points P[nump], generate set of control points cp[numcp].
+    // nump, numcp are defined by the numbers passed to init().
+    // workmem can be NULL if only interpolation is used (#controlpoints == #points).
+    // Approximation (#controlpoints < #points) needs extra working memory though,
+    // so you must pass at least tbsp_getInterpolatorWorkTempSize() elements of P.
+    // Returns how many control points were generated, for convenience.
+    // The working memory can be discarded after the call returns.
+    template<typename P>
+    size_t generateControlPoints(P * cp, P * TBSP_RESTRICT workmem, const P *points) const;
+
+    inline size_t getNumGeneratedControlPoints() { return numcp; }
+    inline size_t getNumInputPoints() { return nump; }
+
+    // ------------------------
+protected:
+    size_t numcp, nump;
+    union
+    {
+        detail::Cholesky<T> cholesky;
+        detail::LUDecomp<T> ludecomp;
+    } solver;
+    detail::MatrixAcc<T> N;
+
+    // To make it checkable in if()
+#if TBSP_HAS_CPP11
+    explicit inline operator bool() const { return !!numcp; }
+#else // Explicit conversion operator is not supported in C++98
+    inline operator const void*() const { return numcp ? this : NULL; }
+#endif
+};
+
+
+
+template<typename T>
+T *Interpolator<T>::init(T * TBSP_RESTRICT const mem, size_t nump, size_t numcp)
+{
+    TBSP_ASSERT(mem);
+
+    // Calling this with 2 points is pointless, < 2 is mathematically impossible
+    if(nump < 2 || numcp < 2 || !mem)
+        return NULL;
+
+    // Can only generate less or equal control points than points
+    TBSP_ASSERT(numcp <= nump);
+    if(!(numcp <= nump))
+        return NULL;
+
+    T *p = mem;
+
+    if(nump == numcp)
+    {
+        detail::allocateCoeffMatrix<T>(N, NULL, nump, numcp); // just to get the size
+        p = solver.ludecomp.init(p, N.size);
+    }
+    else
+    {
+        p = detail::allocateCoeffMatrix<T>(N, p, nump, numcp);
+        p = solver.cholesky.init(p, detail::getLeastSquaresSize(N));
+    }
+
+    if(p)
+    {
+        TBSP_ASSERT(p <= mem + tbsp__getInterpolatorStorageSize(numcp, nump));
+
+        this->numcp = numcp;
+        this->nump = nump;
+    }
+
+    return p;
+}
+
+template<typename T>
+bool Interpolator<T>::refresh(T * TBSP_RESTRICT const tmp, const T * TBSP_RESTRICT knots, unsigned degree)
+{
+    // Need temporary memory
+    TBSP_ASSERT(tmp);
+
+    const size_t numcp = N.size.w;
+    const size_t nump = N.size.h;
+    TBSP_ASSERT(numcp && nump);
+
+    bool ret = false;
+
+    if(nump == numcp)
+    {
+        // N allocated in tmp, solver(N) in mem
+        T *p = detail::allocateCoeffMatrix(N, tmp, nump, numcp);
+        TBSP_ASSERT(p <= tmp + tbsp__getInterpolatorRefreshTempSize(numcp, nump));
+        detail::computeCoeffMatrix(N, knots, degree);
+
+        // N is point-symmetric, ie. NOT diagonally symmetric.
+        // This means we can't use Cholesky decomposition, but LU decomposition is fine.
+        // Note: Cholesky decomposition will at first appear to work,
+        // but the solutions calculated with it are wrong. Don't use this here.
+        solver.ludecomp.refresh(N);
+
+        // N is no longer needed
+        // solver(N) stays
+        N.p = NULL;
+        ret = true;
+    }
+    else
+    {
+        // N allocated in mem, solver(M) in mem, M in tmp
+        detail::computeCoeffMatrix(N, knots, degree);
+
+        // This calculates M = T(N') x N', where N' is N with its borders removed.
+        // A nice property is that M is diagonally symmetric,
+        // means it can be efficiently solved via cholesky decomposition.
+        const detail::MatrixAcc<T> M = detail::generateLeastSquares(tmp, N);
+        TBSP_ASSERT(M.p + (M.size.w * M.size.h) <= tmp + tbsp__getInterpolatorRefreshTempSize(numcp, nump));
+
+        ret = solver.cholesky.refresh(M);
+
+        // M is no longer needed
+        // solver(M) stays
+        // N stays
+    }
+
+    return ret;
+}
+
+template<typename T> template<typename P>
+size_t Interpolator<T>::generateControlPoints(P * cp, P * TBSP_RESTRICT workmem, const P *points) const
+{
+    if(numcp == nump)
+    {
+        // This does not need extra working memory
+        solver.ludecomp.solve(cp, points);
+    }
+    else
+    {
+        // Approximate points with less control points, this is more costly
+        TBSP_ASSERT(workmem); // ... and needs extra memory
+        const size_t h = numcp - 1;
+        const size_t n = nump - 1;
+        const P p0 = points[0];
+        const P pn = points[n];
+
+        // Wrap the least squares estimator somehow together with the points to approximate...
+        // Unfortunately I forgot how this works, so no explanation of mathemagics here, sorry :<
+        const P initial = points[1] - (p0 * N(0,1)) - (pn * N(h,1));
+        for(size_t i = 1; i < h; ++i)
+        {
+            P tmp = initial * N(i,1);
+            for(size_t k = 2; k < n; ++k)
+            {
+                const typename detail::MatrixAcc<T>::RowAcc Nrow = N.row(k);
+                tmp += (points[k] - (p0 * Nrow[0]) - (pn * Nrow[h])) * Nrow[i];
+            }
+            workmem[i-1] = tmp;
+        }
+
+        // End points are not interpolated
+        cp[0] = p0;
+        cp[h] = pn;
+
+        // Solve for all points that are not endpoints
+        TBSP_ASSERT(solver.cholesky.L.size.w == h - 1);
+        solver.cholesky.solve(cp + 1, workmem);
+    }
+    return numcp;
+}
+
+
+// Helper functions for parametrization
+
+template<typename T, typename P>
+void chordal(T *parm, const P *points, size_t n)
+{
+    // Start and end points are always 0 and 1, respectively
+    parm[0] = T(0);
+    parm[--n] = T(1);
+
+    T totaldist = 0;
+    for(size_t i = 1; i < n; ++i)
+    {
+        const T dist = distance(points[i-1], points[i]);
+        totaldist += dist;
+        parm[i] = totaldist
+    }
+
+    // Normalize to 0..1
+    const T m = T(1) / totaldist;
+    for(size_t i = 1; i < n; ++i)
+        parm[i] *= m;
+}
+
+template<typename T, typename P>
+void uniform(T *parm, const P *points, size_t n)
+{
+    const T m = T(1) / T(n - 1);
+    for(size_t i = 0; i < n; ++i)
+        parm[i] = T(i) * m;
+}
+
 } // end namespace tbsp