From 7d2f9615738b173990d0de08a7c218101c00d8b2 Mon Sep 17 00:00:00 2001 From: fgenesis Date: Fri, 5 Jul 2024 02:10:27 +0200 Subject: [PATCH] tbsp: split into alloc, prep, generation steps --- Aquaria/AnimationEditor.cpp | 1 + BBGE/Interpolators.cpp | 17 ++- ExternalLibs/tbsp.hh | 260 +++++++++++++++++++++++------------- 3 files changed, 181 insertions(+), 97 deletions(-) diff --git a/Aquaria/AnimationEditor.cpp b/Aquaria/AnimationEditor.cpp index 0ca2d0d..a6db6d5 100644 --- a/Aquaria/AnimationEditor.cpp +++ b/Aquaria/AnimationEditor.cpp @@ -1006,6 +1006,7 @@ void AnimationEditor::editStripKey() BoneKeyframe *bk = a->getKeyframe(currentKey)->getBoneKeyframe(editingBone->boneIdx); assert(bk->controlpoints.size() == interp->bsp.ctrlX() * interp->bsp.ctrlY()); + assert(!splinegrid); splinegrid = new SplineGrid; DynamicRenderGrid *rgrid = splinegrid->resize(interp->bsp.ctrlX(), interp->bsp.ctrlY(), grid->width(), grid->height(), interp->bsp.degX(), interp->bsp.degY()); diff --git a/BBGE/Interpolators.cpp b/BBGE/Interpolators.cpp index ad58153..66424c4 100644 --- a/BBGE/Interpolators.cpp +++ b/BBGE/Interpolators.cpp @@ -119,11 +119,12 @@ void BSpline2D::resize(size_t cx, size_t cy, unsigned degx, unsigned degy) const size_t interpStorageSizeX = tbsp__getInterpolatorStorageSize(cx, cx); const size_t interpStorageSizeY = tbsp__getInterpolatorStorageSize(cy, cy); - const size_t interpInitTempSize = tbsp__getInterpolatorInitTempSize(maxCp, maxCp); + const size_t interpRefreshTempSize = tbsp__getInterpolatorRefreshTempSize(maxCp, maxCp); const size_t interpStorageNeeded = interpStorageSizeX + interpStorageSizeY; if(_ext && _ext->capacity < interpStorageNeeded) { + _ext->~Extended(); free(_ext); _ext = NULL; } @@ -139,13 +140,17 @@ void BSpline2D::resize(size_t cx, size_t cy, unsigned degx, unsigned degy) if(_ext) { // Some extra temp memory is required during init, but can be discarded right afterward - std::vector interptmp(interpInitTempSize); + std::vector interptmp(interpRefreshTempSize); float *mx = _ext->floats(); float *my = mx + interpStorageSizeX; - _ext->interp.x = tbsp::initInterpolator(mx, &interptmp[0], degx, cx, cx, &knotsX[0]); - _ext->interp.y = tbsp::initInterpolator(my, &interptmp[0], degy, cy, cy, &knotsY[0]); + _ext->interp.x.init(mx, cx, cx); + _ext->interp.x.refresh(&interptmp[0], &knotsX[0], degx); + + _ext->interp.y.init(my, cy, cy); + _ext->interp.y.refresh(&interptmp[0], &knotsY[0], degy); + _ext->tmp2d.init(cx, cy); } } @@ -168,7 +173,7 @@ void BSpline2D::recalc(Vector* dst, size_t xres, size_t yres, const Vector *cont for(size_t i = 0; i < _cpy; ++i, src += _cpx) tmpin[i] = *src; - tbsp::generateControlPoints(&tmpcp[0], NULL, _ext->interp.y, &tmpin[0]); + _ext->interp.y.generateControlPoints(&tmpcp[0], NULL, &tmpin[0]); for(size_t y = 0; y < _cpy; ++y) tmp2d(x, y) = tmpcp[y]; @@ -179,7 +184,7 @@ void BSpline2D::recalc(Vector* dst, size_t xres, size_t yres, const Vector *cont { Vector *row = tmp2d.row(y); memcpy(&tmpin[0], row, sizeof(Vector) * _cpx); - tbsp::generateControlPoints(row, NULL, _ext->interp.x, &tmpin[0]); + _ext->interp.x.generateControlPoints(row, NULL, &tmpin[0]); } controlpoints = tmp2d.data(); diff --git a/ExternalLibs/tbsp.hh b/ExternalLibs/tbsp.hh index beb76e8..89bd81b 100644 --- a/ExternalLibs/tbsp.hh +++ b/ExternalLibs/tbsp.hh @@ -17,7 +17,7 @@ Requirements: Design notes: - C++98 compatible. Stand-alone. -- No memory allocation; all memory is caller-controlled. +- No memory allocation; all memory is caller-controlled and caller-owned. In cases where this is needed, the caller needs to pass appropriately-sized working memory. @@ -298,6 +298,8 @@ namespace detail { struct Size2d { size_t w, h; + + inline bool operator==(const Size2d& o) const { return w == o.w && h == o.h; } }; // Wrap any pointer to access it like a vector (with proper bounds checking) @@ -460,19 +462,28 @@ struct Cholesky // Initializes a solver in the given memory. // On success, bumps ptr forward by the memory it needs; - // on failure, returns NULL. Note that this may also fail if A is not well-formed. + // on failure, returns NULL. // Memory needed: (sizeof(T) * ((A.size.w + 1) * A.size.h)) - T *init(T *mem, const Mat& A) + T *init(T *mem, Size2d size) { - T * const pdiag = mem + (A.size.w * A.size.h); - T * const myend = pdiag + A.size.w; + T * const pdiag = mem + (size.w * size.h); + T * const myend = pdiag + size.w; - TBSP_ASSERT(A.size.w >= A.size.h); - const size_t n = A.size.w; + L.p = mem; + L.size = size; idiag = pdiag; // T[n] + return myend; + } - L.p = mem; - L.size = A.size; + // Sets the matrix A used for solving later. The solver is preconditioned + // for that matrix so that in (A * x + b) b is the only variable. + // May fail if A is not well formed. + bool refresh(const Mat& A) + { + TBSP_ASSERT(A.size == L.size); + TBSP_ASSERT(A.size.w >= A.size.h); + + const size_t n = L.size.w; // Fill the lower left triangle, storing the diagonal separately for(size_t y = 0; y < n; ++y) @@ -491,7 +502,7 @@ struct Cholesky else { TBSP_ASSERT(0 && "Cholesky decomposition failed"); - return NULL; + return false; } } } @@ -505,7 +516,7 @@ struct Cholesky row[x] = zero; } - return myend; + return true; } // Solves x for A * x + b. Both b and x must have length n. @@ -545,21 +556,28 @@ struct LUDecomp // Initializes a solver in the given memory. // On success, bumps ptr forward by the memory it needs; - // on failure, returns NULL. Note that this may also fail if A is not well-formed. + // on failure, returns NULL. // Memory needed: (sizeof(T) * ((A.size.w + 1) * A.size.h)) - T *init(T *mem, const Mat& A) + T *init(T *mem, Size2d size) { - T * const pdiag = mem + (A.size.w * A.size.h); - T * const myend = pdiag + A.size.w; + T * const pdiag = mem + (size.w * size.h); + T * const myend = pdiag + size.w; - TBSP_ASSERT(A.size.w == A.size.h); + TBSP_ASSERT(size.w == size.h); - MatrixAcc LU; LU.p = mem; + LU.size = size; idiag = pdiag; - LU.size = A.size; + return myend; + } + + // Sets the matrix A used for solving later. The solver is preconditioned + // for that matrix so that in (A * x + b) b is the only variable. + void refresh(const Mat& A) + { + TBSP_ASSERT(A.size == LU.size); - const size_t n = A.size.w; + const size_t n = LU.size.w; // Note: This doesn't do pivoting. for (size_t y = 0; y < n; ++y) @@ -581,9 +599,6 @@ struct LUDecomp LU(y, x) = idiagval * e; } } - this->LU = LU; - - return myend; } // Solves x for A * x + b. Both b and x must have length n. @@ -670,14 +685,20 @@ void computeCoeffVector(T * TBSP_RESTRICT Nrow, size_t numcp, T t01, const T * T } } - template -T* computeCoeffMatrix(MatrixAcc& N, T *mem, const T *knots, size_t nump, size_t numcp, size_t degree) +T *allocateCoeffMatrix(MatrixAcc& N, T *mem, size_t nump, size_t numcp) { N.size.w = numcp; N.size.h = nump; N.p = mem; - T * const pend = N.p + (nump * numcp); + return mem + (nump * numcp); +} + +template +void computeCoeffMatrix(MatrixAcc& N, const T *knots, size_t degree) +{ + const size_t numcp = N.size.w; + const size_t nump = N.size.h; const T invsz = T(1) / T(nump - 1); @@ -689,34 +710,95 @@ T* computeCoeffMatrix(MatrixAcc& N, T *mem, const T *knots, size_t nump, size typename MatrixAcc::RowAcc row = N.row(i); computeCoeffVector(&row[0], numcp, t01, knots, degree); // row 0 stores all coefficients for _t[0], etc } +} - return pend; +template +Size2d getLeastSquaresSize(const MatrixAcc& N) +{ + Size2d sz; + sz.w = N.size.w - 2; + sz.h = N.size.w - 2; // (h = w) is intended, M is a square matrix! + return sz; } // Generate least-squares approximator for spline approximation template -T* generateLeastSquares(MatrixAcc& M, T *mem, const MatrixAcc& N) +MatrixAcc generateLeastSquares(T *mem, const MatrixAcc& N) { + MatrixAcc M; M.p = mem; - M.size.w = N.size.w - 2; - M.size.h = N.size.w - 2; // (h = w) is intended, M is a square matrix! + M.size == getLeastSquaresSize(N); T * const pend = M.p + (M.size.w * M.size.h); matMultCenterCutTransposeWithSelf(M, N); - return pend; + return M; } } // end namespace detail // -------------------------------------------------- +// Calculate how many elements of type T are needed as storage for the interpolator +// These should ideally be constexpr, but we want C++98 compatibility. + +// For Interpolator::init() +#define tbsp__getInterpolatorStorageSize(numControlPoints, numPoints) \ + ( ((numControlPoints) == (numControlPoints)) \ + ? (((numControlPoints)+1) * (numPoints)) /* solver(N) */ \ + : ( \ + ((numControlPoints) * (numPoints)) /* N */ \ + + (((numControlPoints)-1) * ((numControlPoints)-2)) /* solver(M) */ \ + ) \ + ) + +// For Interpolator::refresh() +#define tbsp__getInterpolatorRefreshTempSize(numControlPoints, numPoints) \ + ( (numControlPoints) == (numControlPoints) \ + ? ((numControlPoints) * (numPoints)) /* N */ \ + : (((numControlPoints)-2) * ((numControlPoints)-2)) /* M */ \ + ) + +// For Interpolator::generateControlPoints() +#define tbsp_getInterpolatorWorkTempSize(numControlPoints, numPoints) \ + ( ((numControlPoints) == (numPoints)) ? 0 : ((numControlPoints) - 2) ) + +// -------------------------------------------------- + + // One interpolator is precomputed for a specific number of input points and output control points template struct Interpolator { inline Interpolator() : numcp(0), nump(0) {} + // Inits an Interpolator in the passed mem[]. This memory must outlive the Interpolator + // and be at least tbsp__getInterpolatorStorageSize() elements. + // You may move or copy the memory elsewhere and then call init() again to update internal + // pointers to point to the new location. + // This does not access mem at all, it only stores pointers. + // When the number of points or control points change, the memory requirements change too; + // in this case you must re-init(), generate a new knot vector, and refresh(). + T *init(T *mem, size_t nump, size_t numcp); + + // When the knot vector or degree changes, call this to update the internal matrix solver. + // Must be called before generateControlPoints() can be used. + // tmp[] must have space for at least tbsp__getInterpolatorRefreshTempSize() elements + // and can be discarded after the call returns. + bool refresh(T * TBSP_RESTRICT const tmp, const T * TBSP_RESTRICT knots, unsigned degree); + + // Given a set of points P[nump], generate set of control points cp[numcp]. + // nump, numcp are defined by the numbers passed to init(). + // workmem can be NULL if only interpolation is used (#controlpoints == #points). + // Approximation (#controlpoints < #points) needs extra working memory though, + // so you must pass at least tbsp_getInterpolatorWorkTempSize() elements of P. + // Returns how many control points were generated, for convenience. + // The working memory can be discarded after the call returns. + template + size_t generateControlPoints(P * cp, P * TBSP_RESTRICT workmem, const P *points) const; + + // ------------------------ +protected: size_t numcp, nump; union { @@ -733,120 +815,116 @@ struct Interpolator #endif }; -// Calculate how many elements of type T are needed as storage for the interpolator -// This should ideally be constexpr, but we want C++98 compatibility. -#define tbsp__getInterpolatorStorageSize(numControlPoints, numPoints) \ - ( ((numControlPoints) == (numControlPoints)) \ - ? (((numControlPoints)+1) * (numPoints)) /* solver(N) */ \ - : ( \ - ((numControlPoints) * (numPoints)) /* N */ \ - + (((numControlPoints)-1) * ((numControlPoints)-2)) /* solver(M) */ \ - ) \ - ) -#define tbsp__getInterpolatorInitTempSize(numControlPoints, numPoints) \ - ( (numControlPoints) == (numControlPoints) \ - ? ((numControlPoints) * (numPoints)) /* N */ \ - : (((numControlPoints)-2) * ((numControlPoints)-2)) /* M */ \ - ) template -Interpolator initInterpolator(T * TBSP_RESTRICT const mem, T * TBSP_RESTRICT const tmp, size_t degree, size_t nump, size_t numcp, const T * TBSP_RESTRICT knots) +T *Interpolator::init(T * TBSP_RESTRICT const mem, size_t nump, size_t numcp) { - Interpolator interp; + TBSP_ASSERT(mem); // Calling this with 2 points is pointless, < 2 is mathematically impossible - if(nump < 2 || numcp < 2) - return interp; + if(nump < 2 || numcp < 2 || !mem) + return false; // Can only generate less or equal control points than points TBSP_ASSERT(numcp <= nump); if(!(numcp <= nump)) - return interp; + return false; + + T *p = mem; + + if(nump == numcp) + { + detail::allocateCoeffMatrix(N, NULL, nump, numcp); // just to get the size + p = solver.ludecomp.init(p, N.size); + } + else + { + p = detail::allocateCoeffMatrix(N, p, nump, numcp); + p = solver.cholesky.init(p, detail::getLeastSquaresSize(N)); + } + + if(p) + { + TBSP_ASSERT(p <= mem + tbsp__getInterpolatorStorageSize(numcp, nump)); + + this->numcp = numcp; + this->nump = nump; + } + + return p; +} - // Need storage memory - TBSP_ASSERT(mem && tmp); +template +bool Interpolator::refresh(T * TBSP_RESTRICT const tmp, const T * TBSP_RESTRICT knots, unsigned degree) +{ + // Need temporary memory + TBSP_ASSERT(tmp); + + const size_t numcp = N.size.w; + const size_t nump = N.size.h; + TBSP_ASSERT(numcp && nump); - T *p = numcp == nump ? tmp : mem; - p = detail::computeCoeffMatrix(interp.N, p, knots, nump, numcp, degree); - if(!p) - return interp; - TBSP_ASSERT(p <= tmp + tbsp__getInterpolatorInitTempSize(numcp, nump)); + bool ret = false; if(nump == numcp) { // N allocated in tmp, solver(N) in mem + T *p = detail::allocateCoeffMatrix(N, tmp, nump, numcp); + TBSP_ASSERT(p <= tmp + tbsp__getInterpolatorRefreshTempSize(numcp, nump)); + detail::computeCoeffMatrix(N, knots, degree); // N is point-symmetric, ie. NOT diagonally symmetric. // This means we can't use Cholesky decomposition, but LU decomposition is fine. // Note: Cholesky decomposition will at first appear to work, // but the solutions calculated with it are wrong. Don't use this here. - p = interp.solver.ludecomp.init(mem, interp.N); + solver.ludecomp.refresh(N); // N is no longer needed // solver(N) stays - interp.N.p = NULL; - interp.N.size.w = interp.N.size.h = 0; + N.p = NULL; + ret = true; } else { // N allocated in mem, solver(M) in mem, M in tmp + detail::computeCoeffMatrix(N, knots, degree); // This calculates M = T(N') x N', where N' is N with its borders removed. // A nice property is that M is diagonally symmetric, // means it can be efficiently solved via cholesky decomposition. - detail::MatrixAcc M; - p = detail::generateLeastSquares(M, tmp, interp.N); - if(!p) - return interp; - TBSP_ASSERT(p <= tmp + tbsp__getInterpolatorInitTempSize(numcp, nump)); + const detail::MatrixAcc M = detail::generateLeastSquares(tmp, N); + TBSP_ASSERT(M.p + (M.size.w * M.size.h) <= tmp + tbsp__getInterpolatorRefreshTempSize(numcp, nump)); - p = interp.solver.cholesky.init(mem, M); + ret = solver.cholesky.refresh(M); // M is no longer needed // solver(M) stays // N stays } - if(p) - { - TBSP_ASSERT(p <= mem + tbsp__getInterpolatorStorageSize(numcp, nump)); - - interp.numcp = numcp; - interp.nump = nump; - } - - return interp; + return ret; } -#define tbsp_getInterpolatorWorkSize(numControlPoints, numPoints) \ - ( ((numControlPoints) == (numPoints)) ? 0 : ((numControlPoints) - 2) ) - -// Pass workmem[] with at least tbsp_getInterpolatorWorkSize() elements; -// workmem can be NULL if only interpolation is used (#controlpoints == #points). -// Approximation (#controlpoints < #points) needs extra working memory though. -// Returns how many control points were generated, for convenience. -template -size_t generateControlPoints(P * cp, P * TBSP_RESTRICT workmem, const Interpolator& interp, const P *points) +template template +size_t Interpolator::generateControlPoints(P * cp, P * TBSP_RESTRICT workmem, const P *points) const { - const size_t numcp = interp.numcp; - if(numcp == interp.nump) + if(numcp == nump) { // This does not need extra working memory - interp.solver.ludecomp.solve(cp, points); + solver.ludecomp.solve(cp, points); } else { // Approximate points with less control points, this is more costly TBSP_ASSERT(workmem); // ... and needs extra memory const size_t h = numcp - 1; - const size_t n = interp.nump - 1; + const size_t n = nump - 1; const P p0 = points[0]; const P pn = points[n]; // Wrap the least squares estimator somehow together with the points to approximate... // Unfortunately I forgot how this works, so no explanation of mathemagics here, sorry :< - const detail::MatrixAcc N = interp.N; const P initial = points[1] - (p0 * N(0,1)) - (pn * N(h,1)); for(size_t i = 1; i < h; ++i) { @@ -864,8 +942,8 @@ size_t generateControlPoints(P * cp, P * TBSP_RESTRICT workmem, const Interpolat cp[h] = pn; // Solve for all points that are not endpoints - TBSP_ASSERT(interp.solver.cholesky.L.size.w == h - 1); - interp.solver.cholesky.solve(cp + 1, workmem); + TBSP_ASSERT(solver.cholesky.L.size.w == h - 1); + solver.cholesky.solve(cp + 1, workmem); } return numcp; }