From efa4313aa57e4c3511eb1d5d88edc37e99f899fa Mon Sep 17 00:00:00 2001
From: "Suren A. Chilingaryan" <csa@ipecompute4.ands.kit.edu>
Date: Tue, 6 Sep 2022 19:12:09 +0200
Subject: Add all CCPi patches (patches are not applied automatically, but just
 collected)

---
 .../cone_bp.cu                                     | 397 +++++++++++
 .../cone_fp.cu                                     | 516 ++++++++++++++
 .../par3d_bp.cu                                    | 327 +++++++++
 .../par3d_fp.cu                                    | 770 +++++++++++++++++++++
 .../rounding.h                                     |  50 ++
 .../CMakeLists.txt                                 | 141 ++++
 .../FGP_TV_core.c                                  | 266 +++++++
 .../FGP_TV_core.h                                  |  62 ++
 .../CMakeLists.txt                                 | 169 +++++
 .../TNV_core.c                                     | 668 ++++++++++++++++++
 .../TNV_core.h                                     |  47 ++
 .../TNV_core_loop.h                                | 107 +++
 ...larisation-toolkit-fgptv-integer-overflow.patch |  12 +
 .../ccpi-regularisation-toolkit-fgptv-openmp.patch |  19 +
 14 files changed, 3551 insertions(+)
 create mode 100644 patches/astra-toolbox-approximate-projectors/cone_bp.cu
 create mode 100644 patches/astra-toolbox-approximate-projectors/cone_fp.cu
 create mode 100644 patches/astra-toolbox-approximate-projectors/par3d_bp.cu
 create mode 100644 patches/astra-toolbox-approximate-projectors/par3d_fp.cu
 create mode 100644 patches/astra-toolbox-approximate-projectors/rounding.h
 create mode 100644 patches/ccpi-regularisation-toolkit-fast-fgptv/CMakeLists.txt
 create mode 100644 patches/ccpi-regularisation-toolkit-fast-fgptv/FGP_TV_core.c
 create mode 100644 patches/ccpi-regularisation-toolkit-fast-fgptv/FGP_TV_core.h
 create mode 100644 patches/ccpi-regularisation-toolkit-fast-tnv/CMakeLists.txt
 create mode 100755 patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.c
 create mode 100644 patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.h
 create mode 100644 patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core_loop.h
 create mode 100644 patches/ccpi-regularisation-toolkit-fgptv-integer-overflow.patch
 create mode 100644 patches/ccpi-regularisation-toolkit-fgptv-openmp.patch
diff --git a/patches/astra-toolbox-approximate-projectors/cone_bp.cu b/patches/astra-toolbox-approximate-projectors/cone_bp.cu
new file mode 100644
index 0000000..01edcb9
--- /dev/null
+++ b/patches/astra-toolbox-approximate-projectors/cone_bp.cu
@@ -0,0 +1,397 @@
+/*
+-----------------------------------------------------------------------
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
+
+Contact: astra@astra-toolbox.com
+Website: http://www.astra-toolbox.com/
+
+This file is part of the ASTRA Toolbox.
+
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+*/
+
+#include "astra/cuda/3d/util3d.h"
+#include "astra/cuda/3d/dims3d.h"
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include <cuda.h>
+
+namespace astraCUDA3d {
+
+static const unsigned int g_volBlockZ = 6;
+
+static const unsigned int g_anglesPerBlock = 32;
+static const unsigned int g_volBlockX = 16;
+static const unsigned int g_volBlockY = 32;
+
+static const unsigned g_MaxAngles = 1024;
+
+struct DevConeParams {
+	float4 fNumU;
+	float4 fNumV;
+	float4 fDen;
+};
+
+__constant__ DevConeParams gC_C[g_MaxAngles];
+
+#include "rounding.h"
+
+//__launch_bounds__(32*16, 4)
+template<bool FDKWEIGHT, unsigned int ZSIZE>
+__global__ void dev_cone_BP(void* D_volData, unsigned int volPitch,
+                            cudaTextureObject_t tex,
+                            int startAngle, int angleOffset,
+                            const astraCUDA3d::SDimensions3D dims,
+                            float fOutputScale)
+{
+	float* volData = (float*)D_volData;
+
+	int endAngle = startAngle + g_anglesPerBlock;
+	if (endAngle > dims.iProjAngles - angleOffset)
+		endAngle = dims.iProjAngles - angleOffset;
+
+	// threadIdx: x = rel x
+	//            y = rel y
+
+	// blockIdx:  x = x + y
+	//            y = z
+
+
+
+	const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x;
+	const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y;
+
+	if (X >= dims.iVolX)
+		return;
+	if (Y >= dims.iVolY)
+		return;
+
+	const int startZ = blockIdx.y * g_volBlockZ;
+	const float fX = X - 0.5f*dims.iVolX + 0.5f;
+	const float fY = Y - 0.5f*dims.iVolY + 0.5f;
+	const float fZ = startZ - 0.5f*dims.iVolZ + 0.5f;
+
+	float Z[ZSIZE];
+	for(int i=0; i < ZSIZE; i++)
+		Z[i] = 0.0f;
+
+
+	{
+		float fAngle = startAngle + angleOffset + 0.5f;
+
+		for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f)
+		{
+			float4 fCu  = gC_C[angle].fNumU;
+			float4 fCv  = gC_C[angle].fNumV;
+			float4 fCd  = gC_C[angle].fDen;
+
+			float fUNum = fCu.w + fX * fCu.x + fY * fCu.y + fZ * fCu.z;
+			float fVNum = fCv.w + fX * fCv.x + fY * fCv.y + fZ * fCv.z;
+			float fDen  = (FDKWEIGHT ? 1.0f : fCd.w) + fX * fCd.x + fY * fCd.y + fZ * fCd.z;
+
+			float fU,fV, fr;
+
+			for (int idx = 0; idx < ZSIZE; idx++)
+			{
+				fr = __fdividef(1.0f, fDen);
+				fU = fUNum * fr;
+				fV = fVNum * fr;
+
+				float fUf = round(fU) - 0.5f;
+				float fVf = round(fV) - 0.5f;
+
+				textype fU_ = texto(fU);
+				textype fV_ = texto(fV);
+				textype fUf_ = texto(fUf);
+				textype fVf_ = texto(fVf);
+				
+				textype fVal0_0; textocheck(fVal0_0, "bp", tex3D<float>(tex, fUf, fAngle, fVf));
+				textype fVal1_0; textocheck(fVal1_0, "bp", tex3D<float>(tex, fUf + 1.0f, fAngle, fVf));
+				textype fVal0_1; textocheck(fVal0_1, "bp", tex3D<float>(tex, fUf, fAngle, fVf + 1.0f));
+				textype fVal1_1; textocheck(fVal1_1, "bp", tex3D<float>(tex, fUf + 1.0f, fAngle, fVf + 1.0f));
+
+				textype fVal0 = interpolate(fVal0_0, fVal0_1, (fV_ - fVf_));
+				textype fVal1 = interpolate(fVal1_0, fVal1_1, (fV_ - fVf_));
+				float fVal = texfrom(interpolate(fVal0, fVal1, (fU_ - fUf_)));
+
+//				float fVal = tex3D<float>(tex, fU, fAngle, fV);
+				Z[idx] += fr*fr*fVal;
+
+				fUNum += fCu.z;
+				fVNum += fCv.z;
+				fDen  += fCd.z;
+			}
+		}
+	}
+
+	int endZ = ZSIZE;
+	if (endZ > dims.iVolZ - startZ)
+		endZ = dims.iVolZ - startZ;
+
+	for(int i=0; i < endZ; i++)
+		volData[((startZ+i)*dims.iVolY+Y)*volPitch+X] += Z[i] * fOutputScale;
+} //End kernel
+
+
+
+// supersampling version
+__global__ void dev_cone_BP_SS(void* D_volData, unsigned int volPitch, cudaTextureObject_t tex, int startAngle, int angleOffset, const SDimensions3D dims, int iRaysPerVoxelDim, float fOutputScale)
+{
+	float* volData = (float*)D_volData;
+
+	int endAngle = startAngle + g_anglesPerBlock;
+	if (endAngle > dims.iProjAngles - angleOffset)
+		endAngle = dims.iProjAngles - angleOffset;
+
+	// threadIdx: x = rel x
+	//            y = rel y
+
+	// blockIdx:  x = x + y
+    //            y = z
+
+
+	// TO TRY: precompute part of detector intersection formulas in shared mem?
+	// TO TRY: inner loop over z, gather ray values in shared mem
+
+	const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x;
+	const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y;
+
+	if (X >= dims.iVolX)
+		return;
+	if (Y >= dims.iVolY)
+		return;
+
+	const int startZ = blockIdx.y * g_volBlockZ;
+	int endZ = startZ + g_volBlockZ;
+	if (endZ > dims.iVolZ)
+		endZ = dims.iVolZ;
+
+	float fX = X - 0.5f*dims.iVolX + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim;
+	float fY = Y - 0.5f*dims.iVolY + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim;
+	float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim;
+	const float fSubStep = 1.0f/iRaysPerVoxelDim;
+
+	fOutputScale /= (iRaysPerVoxelDim*iRaysPerVoxelDim*iRaysPerVoxelDim);
+
+
+	for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f)
+	{
+
+		float fVal = 0.0f;
+		float fAngle = startAngle + angleOffset + 0.5f;
+
+		for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f)
+		{
+			float4 fCu  = gC_C[angle].fNumU;
+			float4 fCv  = gC_C[angle].fNumV;
+			float4 fCd  = gC_C[angle].fDen;
+
+			float fXs = fX;
+			for (int iSubX = 0; iSubX < iRaysPerVoxelDim; ++iSubX) {
+			float fYs = fY;
+			for (int iSubY = 0; iSubY < iRaysPerVoxelDim; ++iSubY) {
+			float fZs = fZ;
+			for (int iSubZ = 0; iSubZ < iRaysPerVoxelDim; ++iSubZ) {
+
+				const float fUNum = fCu.w + fXs * fCu.x + fYs * fCu.y + fZs * fCu.z;
+				const float fVNum = fCv.w + fXs * fCv.x + fYs * fCv.y + fZs * fCv.z;
+				const float fDen  = fCd.w + fXs * fCd.x + fYs * fCd.y + fZs * fCd.z;
+
+				const float fr = __fdividef(1.0f, fDen);
+				const float fU = fUNum * fr;
+				const float fV = fVNum * fr;
+
+				fVal += tex3D<float>(tex, fU, fAngle, fV) * fr * fr;
+
+				fZs += fSubStep;
+			}
+			fYs += fSubStep;
+			}
+			fXs += fSubStep;
+			}
+
+		}
+
+		volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal * fOutputScale;
+	}
+}
+
+
+bool transferConstants(const SConeProjection* angles, unsigned int iProjAngles, const SProjectorParams3D& params)
+{
+	DevConeParams *p = new DevConeParams[iProjAngles];
+
+	// We need three things in the kernel:
+	// projected coordinates of pixels on the detector:
+
+	// u: || (x-s) v (s-d) || / || u v (s-x) ||
+	// v: -|| u (x-s) (s-d) || / || u v (s-x) ||
+
+	// ray density weighting factor for the adjoint
+	// || u v (s-d) ||^2 / ( |cross(u,v)| * || u v (s-x) ||^2 )
+
+	// FDK weighting factor
+	// ( || u v s || / || u v (s-x) || ) ^ 2
+
+	// Since u and v are ratios with the same denominator, we have
+	// a degree of freedom to scale the denominator. We use that to make
+	// the square of the denominator equal to the relevant weighting factor.
+
+
+	for (unsigned int i = 0; i < iProjAngles; ++i) {
+		Vec3 u(angles[i].fDetUX, angles[i].fDetUY, angles[i].fDetUZ);
+		Vec3 v(angles[i].fDetVX, angles[i].fDetVY, angles[i].fDetVZ);
+		Vec3 s(angles[i].fSrcX, angles[i].fSrcY, angles[i].fSrcZ);
+		Vec3 d(angles[i].fDetSX, angles[i].fDetSY, angles[i].fDetSZ);
+
+
+
+		double fScale;
+		if (!params.bFDKWeighting) {
+			// goal: 1/fDen^2 = || u v (s-d) ||^2 / ( |cross(u,v)| * || u v (s-x) ||^2 )
+			// fDen = ( sqrt(|cross(u,v)|) * || u v (s-x) || ) / || u v (s-d) || 
+			// i.e. scale = sqrt(|cross(u,v)|) * / || u v (s-d) ||
+
+
+			// NB: for cross(u,v) we invert the volume scaling (for the voxel
+			// size normalization) to get the proper dimensions for
+			// the scaling of the adjoint
+
+			fScale = sqrt(scaled_cross3(u,v,Vec3(params.fVolScaleX,params.fVolScaleY,params.fVolScaleZ)).norm()) / det3(u, v, s-d);
+		} else {
+			// goal: 1/fDen = || u v s || / || u v (s-x) ||
+			// fDen = || u v (s-x) || / || u v s ||
+			// i.e., scale = 1 / || u v s ||
+
+			fScale = 1.0 / det3(u, v, s);
+		}
+
+		p[i].fNumU.w = fScale * det3(s,v,d);
+		p[i].fNumU.x = fScale * det3x(v,s-d);
+		p[i].fNumU.y = fScale * det3y(v,s-d);
+		p[i].fNumU.z = fScale * det3z(v,s-d);
+		p[i].fNumV.w = -fScale * det3(s,u,d);
+		p[i].fNumV.x = -fScale * det3x(u,s-d);
+		p[i].fNumV.y = -fScale * det3y(u,s-d);
+		p[i].fNumV.z = -fScale * det3z(u,s-d);
+		p[i].fDen.w = fScale * det3(u, v, s); // == 1.0 for FDK
+		p[i].fDen.x = -fScale * det3x(u, v);
+		p[i].fDen.y = -fScale * det3y(u, v);
+		p[i].fDen.z = -fScale * det3z(u, v);
+	}
+
+	// TODO: Check for errors
+	cudaMemcpyToSymbol(gC_C, p, iProjAngles*sizeof(DevConeParams), 0, cudaMemcpyHostToDevice);
+
+	delete[] p;
+
+	return true;
+}
+
+
+bool ConeBP_Array(cudaPitchedPtr D_volumeData,
+                  cudaArray *D_projArray,
+                  const SDimensions3D& dims, const SConeProjection* angles,
+                  const SProjectorParams3D& params)
+{
+	cudaTextureObject_t D_texObj;
+	if (!createTextureObject3D(D_projArray, D_texObj))
+		return false;
+
+	float fOutputScale;
+	if (params.bFDKWeighting) {
+		// NB: assuming cube voxels here
+		fOutputScale = params.fOutputScale / (params.fVolScaleX);
+	} else {
+		fOutputScale = params.fOutputScale * (params.fVolScaleX * params.fVolScaleY * params.fVolScaleZ);
+	}
+
+	bool ok = true;
+
+	for (unsigned int th = 0; th < dims.iProjAngles; th += g_MaxAngles) {
+		unsigned int angleCount = g_MaxAngles;
+		if (th + angleCount > dims.iProjAngles)
+			angleCount = dims.iProjAngles - th;
+
+		ok = transferConstants(angles, angleCount, params);
+		if (!ok)
+			break;
+
+		dim3 dimBlock(g_volBlockX, g_volBlockY);
+
+		dim3 dimGrid(((dims.iVolX/1+g_volBlockX-1)/(g_volBlockX))*((dims.iVolY/1+1*g_volBlockY-1)/(1*g_volBlockY)), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ);
+
+		// timeval t;
+		// tic(t);
+
+		for (unsigned int i = 0; i < angleCount; i += g_anglesPerBlock) {
+		// printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr); 
+			if (params.bFDKWeighting) {
+				if (dims.iVolZ == 1) {
+					dev_cone_BP<true, 1><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale);
+				} else {
+					dev_cone_BP<true, g_volBlockZ><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale);
+				}
+			} else if (params.iRaysPerVoxelDim == 1) {
+				if (dims.iVolZ == 1) {
+					dev_cone_BP<false, 1><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale);
+				} else {
+					dev_cone_BP<false, g_volBlockZ><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale);
+				}
+			} else
+				dev_cone_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, params.iRaysPerVoxelDim, fOutputScale);
+		}
+
+		// TODO: Consider not synchronizing here, if possible.
+		ok = checkCuda(cudaThreadSynchronize(), "cone_bp");
+		if (!ok)
+			break;
+
+		angles = angles + angleCount;
+		// printf("%f\n", toc(t));
+
+	}
+
+	cudaDestroyTextureObject(D_texObj);
+
+	return ok;
+}
+
+bool ConeBP(cudaPitchedPtr D_volumeData,
+            cudaPitchedPtr D_projData,
+            const SDimensions3D& dims, const SConeProjection* angles,
+            const SProjectorParams3D& params)
+{
+	// transfer projections to array
+
+	cudaArray* cuArray = allocateProjectionArray(dims);
+	transferProjectionsToArray(D_projData, cuArray, dims);
+
+	bool ret = ConeBP_Array(D_volumeData, cuArray, dims, angles, params);
+
+	cudaFreeArray(cuArray);
+
+	return ret;
+}
+
+
+}
diff --git a/patches/astra-toolbox-approximate-projectors/cone_fp.cu b/patches/astra-toolbox-approximate-projectors/cone_fp.cu
new file mode 100644
index 0000000..f11813c
--- /dev/null
+++ b/patches/astra-toolbox-approximate-projectors/cone_fp.cu
@@ -0,0 +1,516 @@
+/*
+-----------------------------------------------------------------------
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
+
+Contact: astra@astra-toolbox.com
+Website: http://www.astra-toolbox.com/
+
+This file is part of the ASTRA Toolbox.
+
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+*/
+
+#include "astra/cuda/3d/util3d.h"
+#include "astra/cuda/3d/dims3d.h"
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include <cuda.h>
+
+namespace astraCUDA3d {
+
+static const unsigned int g_anglesPerBlock = 4;
+
+// thickness of the slices we're splitting the volume up into
+static const unsigned int g_blockSlices = 4;
+static const unsigned int g_detBlockU = 32;
+static const unsigned int g_detBlockV = 32;
+
+static const unsigned g_MaxAngles = 1024;
+__constant__ float gC_SrcX[g_MaxAngles];
+__constant__ float gC_SrcY[g_MaxAngles];
+__constant__ float gC_SrcZ[g_MaxAngles];
+__constant__ float gC_DetSX[g_MaxAngles];
+__constant__ float gC_DetSY[g_MaxAngles];
+__constant__ float gC_DetSZ[g_MaxAngles];
+__constant__ float gC_DetUX[g_MaxAngles];
+__constant__ float gC_DetUY[g_MaxAngles];
+__constant__ float gC_DetUZ[g_MaxAngles];
+__constant__ float gC_DetVX[g_MaxAngles];
+__constant__ float gC_DetVY[g_MaxAngles];
+__constant__ float gC_DetVZ[g_MaxAngles];
+
+
+// x=0, y=1, z=2
+struct DIR_X {
+	__device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolX; }
+	__device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolY; }
+	__device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolZ; }
+	__device__ float c0(float x, float y, float z) const { return x; }
+	__device__ float c1(float x, float y, float z) const { return y; }
+	__device__ float c2(float x, float y, float z) const { return z; }
+	__device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f0, f1, f2); }
+	__device__ float x(float f0, float f1, float f2) const { return f0; }
+	__device__ float y(float f0, float f1, float f2) const { return f1; }
+	__device__ float z(float f0, float f1, float f2) const { return f2; }
+};
+
+// y=0, x=1, z=2
+struct DIR_Y {
+	__device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolY; }
+	__device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolX; }
+	__device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolZ; }
+	__device__ float c0(float x, float y, float z) const { return y; }
+	__device__ float c1(float x, float y, float z) const { return x; }
+	__device__ float c2(float x, float y, float z) const { return z; }
+	__device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f1, f0, f2); }
+	__device__ float x(float f0, float f1, float f2) const { return f1; }
+	__device__ float y(float f0, float f1, float f2) const { return f0; }
+	__device__ float z(float f0, float f1, float f2) const { return f2; }
+};
+
+// z=0, x=1, y=2
+struct DIR_Z {
+	__device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolZ; }
+	__device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolX; }
+	__device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolY; }
+	__device__ float c0(float x, float y, float z) const { return z; }
+	__device__ float c1(float x, float y, float z) const { return x; }
+	__device__ float c2(float x, float y, float z) const { return y; }
+	__device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f1, f2, f0); }
+	__device__ float x(float f0, float f1, float f2) const { return f1; }
+	__device__ float y(float f0, float f1, float f2) const { return f2; }
+	__device__ float z(float f0, float f1, float f2) const { return f0; }
+};
+
+struct SCALE_CUBE {
+	float fOutputScale;
+	__device__ float scale(float a1, float a2) const { return sqrt(a1*a1+a2*a2+1.0f) * fOutputScale; }
+};
+
+struct SCALE_NONCUBE {
+	float fScale1;
+	float fScale2;
+	float fOutputScale;
+	__device__ float scale(float a1, float a2) const { return sqrt(a1*a1*fScale1+a2*a2*fScale2+1.0f) * fOutputScale; }
+};
+
+
+bool transferConstants(const SConeProjection* angles, unsigned int iProjAngles)
+{
+	// transfer angles to constant memory
+	float* tmp = new float[iProjAngles];
+
+#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+	TRANSFER_TO_CONSTANT(SrcX);
+	TRANSFER_TO_CONSTANT(SrcY);
+	TRANSFER_TO_CONSTANT(SrcZ);
+	TRANSFER_TO_CONSTANT(DetSX);
+	TRANSFER_TO_CONSTANT(DetSY);
+	TRANSFER_TO_CONSTANT(DetSZ);
+	TRANSFER_TO_CONSTANT(DetUX);
+	TRANSFER_TO_CONSTANT(DetUY);
+	TRANSFER_TO_CONSTANT(DetUZ);
+	TRANSFER_TO_CONSTANT(DetVX);
+	TRANSFER_TO_CONSTANT(DetVY);
+	TRANSFER_TO_CONSTANT(DetVZ);
+
+#undef TRANSFER_TO_CONSTANT
+
+	delete[] tmp;
+
+	return true;
+}
+
+
+#include "rounding.h"
+
+	// threadIdx: x = ??? detector  (u?)
+	//            y = relative angle
+
+	// blockIdx:  x = ??? detector  (u+v?)
+    //            y = angle block
+
+template<class COORD, class SCALE>
+__global__ void cone_FP_t(float* D_projData, unsigned int projPitch,
+                          cudaTextureObject_t tex,
+                          unsigned int startSlice,
+                          unsigned int startAngle, unsigned int endAngle,
+                          const SDimensions3D dims,
+                          SCALE sc)
+{
+	COORD c;
+
+	int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y;
+	if (angle >= endAngle)
+		return;
+
+	const float fSrcX = gC_SrcX[angle];
+	const float fSrcY = gC_SrcY[angle];
+	const float fSrcZ = gC_SrcZ[angle];
+	const float fDetUX = gC_DetUX[angle];
+	const float fDetUY = gC_DetUY[angle];
+	const float fDetUZ = gC_DetUZ[angle];
+	const float fDetVX = gC_DetVX[angle];
+	const float fDetVY = gC_DetVY[angle];
+	const float fDetVZ = gC_DetVZ[angle];
+	const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX;
+	const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY;
+	const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ;
+
+	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+	if (detectorU >= dims.iProjU)
+		return;
+	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
+	int endDetectorV = startDetectorV + g_detBlockV;
+	if (endDetectorV > dims.iProjV)
+		endDetectorV = dims.iProjV;
+
+	int endSlice = startSlice + g_blockSlices;
+	if (endSlice > c.nSlices(dims))
+		endSlice = c.nSlices(dims);
+
+	for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV)
+	{
+		/* Trace ray from Src to (detectorU,detectorV) from */
+		/* X = startSlice to X = endSlice                   */
+
+		const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX;
+		const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY;
+		const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ;
+
+		/*        (x)   ( 1)       ( 0) */
+		/* ray:   (y) = (ay) * x + (by) */
+		/*        (z)   (az)       (bz) */
+
+		const float a1 = (c.c1(fSrcX,fSrcY,fSrcZ) - c.c1(fDetX,fDetY,fDetZ)) / (c.c0(fSrcX,fSrcY,fSrcZ) - c.c0(fDetX,fDetY,fDetZ));
+		const float a2 = (c.c2(fSrcX,fSrcY,fSrcZ) - c.c2(fDetX,fDetY,fDetZ)) / (c.c0(fSrcX,fSrcY,fSrcZ) - c.c0(fDetX,fDetY,fDetZ));
+		const float b1 = c.c1(fSrcX,fSrcY,fSrcZ) - a1 * c.c0(fSrcX,fSrcY,fSrcZ);
+		const float b2 = c.c2(fSrcX,fSrcY,fSrcZ) - a2 * c.c0(fSrcX,fSrcY,fSrcZ);
+
+		const float fDistCorr = sc.scale(a1, a2);
+
+		float fVal = 0.0f;
+
+		float f0 = startSlice + 0.5f;
+		float f1 = a1 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b1 + 0.5f*c.nDim1(dims) - 0.5f + 0.5f;
+		float f2 = a2 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b2 + 0.5f*c.nDim2(dims) - 0.5f + 0.5f;
+
+		for (int s = startSlice; s < endSlice; ++s)
+		{
+
+			float f1f = round(f1) - 0.5f;
+			float f2f = round(f2) - 0.5f;
+
+			textype f1_ = texto(f1);
+			textype f2_ = texto(f2);
+			textype f1f_ = texto(f1f);
+			textype f2f_ = texto(f2f);
+
+			textype fVal0_0; textocheck(fVal0_0, "fp", c.tex(tex, f0, f1f, f2f));
+			textype fVal1_0; textocheck(fVal1_0, "fp", c.tex(tex, f0, f1f + 1.0f, f2f));
+			textype fVal0_1; textocheck(fVal0_1, "fp", c.tex(tex, f0, f1f, f2f + 1.0f));
+			textype fVal1_1; textocheck(fVal1_1, "fp", c.tex(tex, f0, f1f + 1.0f, f2f + 1.0f));
+
+			textype fVal0 = interpolate(fVal0_0, fVal0_1, (f2_ - f2f_));
+			textype fVal1 = interpolate(fVal1_0, fVal1_1, (f2_ - f2f_));
+			fVal += texfrom(interpolate(fVal0, fVal1, (f1_ - f1f_)));
+
+//			fVal += c.tex(tex, f0, f1, f2);
+			f0 += 1.0f;
+			f1 += a1;
+			f2 += a2;
+		}
+
+		fVal *= fDistCorr;
+
+		D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal;
+	}
+}
+
+template<class COORD>
+__global__ void cone_FP_SS_t(float* D_projData, unsigned int projPitch,
+                             cudaTextureObject_t tex,
+                             unsigned int startSlice,
+                             unsigned int startAngle, unsigned int endAngle,
+                             const SDimensions3D dims, int iRaysPerDetDim,
+                             SCALE_NONCUBE sc)
+{
+	COORD c;
+
+	int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y;
+	if (angle >= endAngle)
+		return;
+
+	const float fSrcX = gC_SrcX[angle];
+	const float fSrcY = gC_SrcY[angle];
+	const float fSrcZ = gC_SrcZ[angle];
+	const float fDetUX = gC_DetUX[angle];
+	const float fDetUY = gC_DetUY[angle];
+	const float fDetUZ = gC_DetUZ[angle];
+	const float fDetVX = gC_DetVX[angle];
+	const float fDetVY = gC_DetVY[angle];
+	const float fDetVZ = gC_DetVZ[angle];
+	const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX;
+	const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY;
+	const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ;
+
+	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+	if (detectorU >= dims.iProjU)
+		return;
+	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
+	int endDetectorV = startDetectorV + g_detBlockV;
+	if (endDetectorV > dims.iProjV)
+		endDetectorV = dims.iProjV;
+
+	int endSlice = startSlice + g_blockSlices;
+	if (endSlice > c.nSlices(dims))
+		endSlice = c.nSlices(dims);
+
+	const float fSubStep = 1.0f/iRaysPerDetDim;
+
+	for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV)
+	{
+		/* Trace ray from Src to (detectorU,detectorV) from */
+		/* X = startSlice to X = endSlice                   */
+
+		float fV = 0.0f;
+
+		float fdU = detectorU - 0.5f + 0.5f*fSubStep;
+		for (int iSubU = 0; iSubU < iRaysPerDetDim; ++iSubU, fdU+=fSubStep) {
+		float fdV = detectorV - 0.5f + 0.5f*fSubStep;
+		for (int iSubV = 0; iSubV < iRaysPerDetDim; ++iSubV, fdV+=fSubStep) {
+
+		const float fDetX = fDetSX + fdU*fDetUX + fdV*fDetVX;
+		const float fDetY = fDetSY + fdU*fDetUY + fdV*fDetVY;
+		const float fDetZ = fDetSZ + fdU*fDetUZ + fdV*fDetVZ;
+
+		/*        (x)   ( 1)       ( 0) */
+		/* ray:   (y) = (ay) * x + (by) */
+		/*        (z)   (az)       (bz) */
+
+		const float a1 = (c.c1(fSrcX,fSrcY,fSrcZ) - c.c1(fDetX,fDetY,fDetZ)) / (c.c0(fSrcX,fSrcY,fSrcZ) - c.c0(fDetX,fDetY,fDetZ));
+		const float a2 = (c.c2(fSrcX,fSrcY,fSrcZ) - c.c2(fDetX,fDetY,fDetZ)) / (c.c0(fSrcX,fSrcY,fSrcZ) - c.c0(fDetX,fDetY,fDetZ));
+		const float b1 = c.c1(fSrcX,fSrcY,fSrcZ) - a1 * c.c0(fSrcX,fSrcY,fSrcZ);
+		const float b2 = c.c2(fSrcX,fSrcY,fSrcZ) - a2 * c.c0(fSrcX,fSrcY,fSrcZ);
+
+		const float fDistCorr = sc.scale(a1, a2);
+
+		float fVal = 0.0f;
+
+		float f0 = startSlice + 0.5f;
+		float f1 = a1 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b1 + 0.5f*c.nDim1(dims) - 0.5f + 0.5f;
+		float f2 = a2 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b2 + 0.5f*c.nDim2(dims) - 0.5f + 0.5f;
+
+		for (int s = startSlice; s < endSlice; ++s)
+		{
+			fVal += c.tex(tex, f0, f1, f2);
+			f0 += 1.0f;
+			f1 += a1;
+			f2 += a2;
+		}
+
+		fVal *= fDistCorr;
+		fV += fVal;
+
+		}
+		}
+
+		D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fV / (iRaysPerDetDim * iRaysPerDetDim);
+	}
+}
+
+
+bool ConeFP_Array_internal(cudaPitchedPtr D_projData,
+                  cudaTextureObject_t D_texObj,
+                  const SDimensions3D& dims,
+                  unsigned int angleCount, const SConeProjection* angles,
+                  const SProjectorParams3D& params)
+{
+	if (!transferConstants(angles, angleCount))
+		return false;
+
+	std::list<cudaStream_t> streams;
+	dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles
+
+	// Run over all angles, grouping them into groups of the same
+	// orientation (roughly horizontal vs. roughly vertical).
+	// Start a stream of grids for each such group.
+
+	unsigned int blockStart = 0;
+	unsigned int blockEnd = 0;
+	int blockDirection = 0;
+
+	bool cube = true;
+	if (abs(params.fVolScaleX / params.fVolScaleY - 1.0) > 0.00001)
+		cube = false;
+	if (abs(params.fVolScaleX / params.fVolScaleZ - 1.0) > 0.00001)
+		cube = false;
+
+	SCALE_CUBE scube;
+	scube.fOutputScale = params.fOutputScale * params.fVolScaleX;
+
+	SCALE_NONCUBE snoncubeX;
+	float fS1 = params.fVolScaleY / params.fVolScaleX;
+	snoncubeX.fScale1 = fS1 * fS1;
+	float fS2 = params.fVolScaleZ / params.fVolScaleX;
+	snoncubeX.fScale2 = fS2 * fS2;
+	snoncubeX.fOutputScale = params.fOutputScale * params.fVolScaleX;
+
+	SCALE_NONCUBE snoncubeY;
+	fS1 = params.fVolScaleX / params.fVolScaleY;
+	snoncubeY.fScale1 = fS1 * fS1;
+	fS2 = params.fVolScaleZ / params.fVolScaleY;
+	snoncubeY.fScale2 = fS2 * fS2;
+	snoncubeY.fOutputScale = params.fOutputScale * params.fVolScaleY;
+
+	SCALE_NONCUBE snoncubeZ;
+	fS1 = params.fVolScaleX / params.fVolScaleZ;
+	snoncubeZ.fScale1 = fS1 * fS1;
+	fS2 = params.fVolScaleY / params.fVolScaleZ;
+	snoncubeZ.fScale2 = fS2 * fS2;
+	snoncubeZ.fOutputScale = params.fOutputScale * params.fVolScaleZ;
+
+	// timeval t;
+	// tic(t);
+
+	for (unsigned int a = 0; a <= angleCount; ++a) {
+		int dir = -1;
+		if (a != angleCount) {
+			float dX = fabsf(angles[a].fSrcX - (angles[a].fDetSX + dims.iProjU*angles[a].fDetUX*0.5f + dims.iProjV*angles[a].fDetVX*0.5f));
+			float dY = fabsf(angles[a].fSrcY - (angles[a].fDetSY + dims.iProjU*angles[a].fDetUY*0.5f + dims.iProjV*angles[a].fDetVY*0.5f));
+			float dZ = fabsf(angles[a].fSrcZ - (angles[a].fDetSZ + dims.iProjU*angles[a].fDetUZ*0.5f + dims.iProjV*angles[a].fDetVZ*0.5f));
+
+			if (dX >= dY && dX >= dZ)
+				dir = 0;
+			else if (dY >= dX && dY >= dZ)
+				dir = 1;
+			else
+				dir = 2;
+		}
+
+		if (a == angleCount || dir != blockDirection) {
+			// block done
+
+			blockEnd = a;
+			if (blockStart != blockEnd) {
+
+				dim3 dimGrid(
+				             ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
+(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
+
+				// TODO: consider limiting number of handle (chaotic) geoms
+				//       with many alternating directions
+				cudaStream_t stream;
+				cudaStreamCreate(&stream);
+				streams.push_back(stream);
+
+				// printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y);
+
+				if (blockDirection == 0) {
+					for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices)
+						if (params.iRaysPerDetDim == 1)
+							if (cube)
+								cone_FP_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, scube);
+							else
+								cone_FP_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, snoncubeX);
+						else
+							cone_FP_SS_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeX);
+				} else if (blockDirection == 1) {
+					for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices)
+						if (params.iRaysPerDetDim == 1)
+							if (cube)
+								cone_FP_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, scube);
+							else
+								cone_FP_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, snoncubeY);
+						else
+							cone_FP_SS_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeY);
+				} else if (blockDirection == 2) {
+					for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices)
+						if (params.iRaysPerDetDim == 1)
+							if (cube)
+								cone_FP_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, scube);
+							else
+								cone_FP_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, snoncubeZ);
+						else
+							cone_FP_SS_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeZ);
+				}
+
+			}
+
+			blockDirection = dir;
+			blockStart = a;
+		}
+	}
+
+	bool ok = true;
+
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+		ok &= checkCuda(cudaStreamSynchronize(*iter), "cone_fp");
+		cudaStreamDestroy(*iter);
+	}
+
+	// printf("%f\n", toc(t));
+
+	return ok;
+}
+
+
+bool ConeFP(cudaPitchedPtr D_volumeData,
+            cudaPitchedPtr D_projData,
+            const SDimensions3D& dims, const SConeProjection* angles,
+            const SProjectorParams3D& params)
+{
+	// transfer volume to array
+	cudaArray* cuArray = allocateVolumeArray(dims);
+	transferVolumeToArray(D_volumeData, cuArray, dims);
+
+	cudaTextureObject_t D_texObj;
+	if (!createTextureObject3D(cuArray, D_texObj)) {
+		cudaFreeArray(cuArray);
+		return false;
+	}
+
+	bool ret;
+
+	for (unsigned int iAngle = 0; iAngle < dims.iProjAngles; iAngle += g_MaxAngles) {
+		unsigned int iEndAngle = iAngle + g_MaxAngles;
+		if (iEndAngle >= dims.iProjAngles)
+			iEndAngle = dims.iProjAngles;
+
+		cudaPitchedPtr D_subprojData = D_projData;
+		D_subprojData.ptr = (char*)D_projData.ptr + iAngle * D_projData.pitch;
+
+		ret = ConeFP_Array_internal(D_subprojData, D_texObj,
+		                            dims, iEndAngle - iAngle, angles + iAngle,
+		                            params);
+		if (!ret)
+			break;
+	}
+
+	cudaFreeArray(cuArray);
+
+	return ret;
+}
+
+
+}
diff --git a/patches/astra-toolbox-approximate-projectors/par3d_bp.cu b/patches/astra-toolbox-approximate-projectors/par3d_bp.cu
new file mode 100644
index 0000000..7958ac9
--- /dev/null
+++ b/patches/astra-toolbox-approximate-projectors/par3d_bp.cu
@@ -0,0 +1,327 @@
+/*
+-----------------------------------------------------------------------
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
+
+Contact: astra@astra-toolbox.com
+Website: http://www.astra-toolbox.com/
+
+This file is part of the ASTRA Toolbox.
+
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+*/
+
+#include "astra/cuda/3d/util3d.h"
+#include "astra/cuda/3d/dims3d.h"
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include <cuda.h>
+
+namespace astraCUDA3d {
+
+static const unsigned int g_volBlockZ = 6;
+
+static const unsigned int g_anglesPerBlock = 32;
+static const unsigned int g_volBlockX = 16;
+static const unsigned int g_volBlockY = 32;
+
+static const unsigned g_MaxAngles = 1024;
+
+struct DevPar3DParams {
+	float4 fNumU;
+	float4 fNumV;
+};
+
+__constant__ DevPar3DParams gC_C[g_MaxAngles];
+__constant__ float gC_scale[g_MaxAngles];
+
+
+#include "rounding.h"
+
+template<unsigned int ZSIZE>
+__global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, cudaTextureObject_t tex, int startAngle, int angleOffset, const SDimensions3D dims, float fOutputScale)
+{
+	float* volData = (float*)D_volData;
+
+	int endAngle = startAngle + g_anglesPerBlock;
+	if (endAngle > dims.iProjAngles - angleOffset)
+		endAngle = dims.iProjAngles - angleOffset;
+
+	// threadIdx: x = rel x
+	//            y = rel y
+
+	// blockIdx:  x = x + y
+	//            y = z
+
+
+	const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x;
+	const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y;
+
+	if (X >= dims.iVolX)
+		return;
+	if (Y >= dims.iVolY)
+		return;
+
+	const int startZ = blockIdx.y * g_volBlockZ;
+
+	float fX = X - 0.5f*dims.iVolX + 0.5f;
+	float fY = Y - 0.5f*dims.iVolY + 0.5f;
+	float fZ = startZ - 0.5f*dims.iVolZ + 0.5f;
+
+	float Z[ZSIZE];
+	for(int i=0; i < ZSIZE; i++)
+		Z[i] = 0.0f;
+
+	{
+		float fAngle = startAngle + angleOffset + 0.5f;
+
+		for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f)
+		{
+
+			float4 fCu = gC_C[angle].fNumU;
+			float4 fCv = gC_C[angle].fNumV;
+			float fS = gC_scale[angle];
+
+			float fU = fCu.w + fX * fCu.x + fY * fCu.y + fZ * fCu.z;
+			float fV = fCv.w + fX * fCv.x + fY * fCv.y + fZ * fCv.z;
+//			printf("%f %f\n", fU, fV);
+
+			for (int idx = 0; idx < ZSIZE; ++idx) {
+				float fVal;
+				textype h5 = texto(0.5f);
+				textype fU_ = texto(fU);
+				textype fUf_ = texto(floor(fU));
+				float fUf = floor(fU);
+				
+				if ((fU - fUf) < 0.5f) {
+				    textype fVal1 = texto(tex3D<float>(tex, fUf - 0.5f, fAngle, fV));
+				    textype fVal2 = texto(tex3D<float>(tex, fUf + 0.5f, fAngle, fV));
+				    fVal = texfrom(fVal1 + (fU_ + h5 - fUf_) * (fVal2 - fVal1));
+				} else {
+				    textype fVal1 = texto(tex3D<float>(tex, fUf + 0.5f, fAngle, fV));
+				    textype fVal2 = texto(tex3D<float>(tex, fUf + 1.5f, fAngle, fV));
+				    fVal = texfrom(fVal1 + (fU_ - h5 - fUf_) * (fVal2 - fVal1));
+				}
+
+//				float fVal = tex3D<float>(tex, fU, fAngle, fV);
+				Z[idx] += fVal * fS;
+
+				fU += fCu.z;
+				fV += fCv.z;
+			}
+
+		}
+	}
+
+	int endZ = ZSIZE;
+	if (endZ > dims.iVolZ - startZ)
+		endZ = dims.iVolZ - startZ;
+
+	for(int i=0; i < endZ; i++)
+		volData[((startZ+i)*dims.iVolY+Y)*volPitch+X] += Z[i] * fOutputScale;
+}
+
+// supersampling version
+__global__ void dev_par3D_BP_SS(void* D_volData, unsigned int volPitch, cudaTextureObject_t tex, int startAngle, int angleOffset, const SDimensions3D dims, int iRaysPerVoxelDim, float fOutputScale)
+{
+	float* volData = (float*)D_volData;
+
+	int endAngle = startAngle + g_anglesPerBlock;
+	if (endAngle > dims.iProjAngles - angleOffset)
+		endAngle = dims.iProjAngles - angleOffset;
+
+	// threadIdx: x = rel x
+	//            y = rel y
+
+	// blockIdx:  x = x + y
+    //            y = z
+
+
+	// TO TRY: precompute part of detector intersection formulas in shared mem?
+	// TO TRY: inner loop over z, gather ray values in shared mem
+
+	const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x;
+	const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y;
+
+	if (X >= dims.iVolX)
+		return;
+	if (Y >= dims.iVolY)
+		return;
+
+	const int startZ = blockIdx.y * g_volBlockZ;
+	int endZ = startZ + g_volBlockZ;
+	if (endZ > dims.iVolZ)
+		endZ = dims.iVolZ;
+
+	float fX = X - 0.5f*dims.iVolX + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim;
+	float fY = Y - 0.5f*dims.iVolY + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim;
+	float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim;
+
+	const float fSubStep = 1.0f/iRaysPerVoxelDim;
+
+	fOutputScale /= (iRaysPerVoxelDim*iRaysPerVoxelDim*iRaysPerVoxelDim);
+
+
+	for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f)
+	{
+
+		float fVal = 0.0f;
+		float fAngle = startAngle + angleOffset + 0.5f;
+
+		for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f)
+		{
+			float4 fCu = gC_C[angle].fNumU;
+			float4 fCv = gC_C[angle].fNumV;
+			float fS = gC_scale[angle];
+
+			float fXs = fX;
+			for (int iSubX = 0; iSubX < iRaysPerVoxelDim; ++iSubX) {
+			float fYs = fY;
+			for (int iSubY = 0; iSubY < iRaysPerVoxelDim; ++iSubY) {
+			float fZs = fZ;
+			for (int iSubZ = 0; iSubZ < iRaysPerVoxelDim; ++iSubZ) {
+
+				const float fU = fCu.w + fXs * fCu.x + fYs * fCu.y + fZs * fCu.z;
+				const float fV = fCv.w + fXs * fCv.x + fYs * fCv.y + fZs * fCv.z;
+
+				fVal += tex3D<float>(tex, fU, fAngle, fV) * fS;
+				fZs += fSubStep;
+			}
+			fYs += fSubStep;
+			}
+			fXs += fSubStep;
+			}
+
+		}
+
+		volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal * fOutputScale;
+	}
+
+}
+
+bool transferConstants(const SPar3DProjection* angles, unsigned int iProjAngles, const SProjectorParams3D& params)
+{
+	DevPar3DParams *p = new DevPar3DParams[iProjAngles];
+	float *s = new float[iProjAngles];
+
+	for (unsigned int i = 0; i < iProjAngles; ++i) {
+		Vec3 u(angles[i].fDetUX, angles[i].fDetUY, angles[i].fDetUZ);
+		Vec3 v(angles[i].fDetVX, angles[i].fDetVY, angles[i].fDetVZ);
+		Vec3 r(angles[i].fRayX, angles[i].fRayY, angles[i].fRayZ);
+		Vec3 d(angles[i].fDetSX, angles[i].fDetSY, angles[i].fDetSZ);
+
+		double fDen = det3(r,u,v);
+		p[i].fNumU.x = -det3x(r,v) / fDen;
+		p[i].fNumU.y = -det3y(r,v) / fDen;
+		p[i].fNumU.z = -det3z(r,v) / fDen;
+		p[i].fNumU.w = -det3(r,d,v) / fDen;
+		p[i].fNumV.x = det3x(r,u) / fDen;
+		p[i].fNumV.y = det3y(r,u) / fDen;
+		p[i].fNumV.z = det3z(r,u) / fDen;
+		p[i].fNumV.w = det3(r,d,u) / fDen;
+
+		s[i] = 1.0 / scaled_cross3(u,v,Vec3(params.fVolScaleX,params.fVolScaleY,params.fVolScaleZ)).norm();
+	}
+
+	cudaMemcpyToSymbol(gC_C, p, iProjAngles*sizeof(DevPar3DParams), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(gC_scale, s, iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+
+	delete[] p;
+	delete[] s;
+
+	return true;
+}
+
+bool Par3DBP_Array(cudaPitchedPtr D_volumeData,
+                   cudaArray *D_projArray,
+                   const SDimensions3D& dims, const SPar3DProjection* angles,
+                   const SProjectorParams3D& params)
+{
+	cudaTextureObject_t D_texObj;
+	if (!createTextureObject3D(D_projArray, D_texObj))
+		return false;
+
+	float fOutputScale = params.fOutputScale * params.fVolScaleX * params.fVolScaleY * params.fVolScaleZ;
+
+	bool ok = true;
+
+	for (unsigned int th = 0; th < dims.iProjAngles; th += g_MaxAngles) {
+		unsigned int angleCount = g_MaxAngles;
+		if (th + angleCount > dims.iProjAngles)
+			angleCount = dims.iProjAngles - th;
+
+		ok = transferConstants(angles, angleCount, params);
+		if (!ok)
+			break;
+
+		dim3 dimBlock(g_volBlockX, g_volBlockY);
+
+		dim3 dimGrid(((dims.iVolX+g_volBlockX-1)/g_volBlockX)*((dims.iVolY+g_volBlockY-1)/g_volBlockY), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ);
+
+		// timeval t;
+		// tic(t);
+
+		for (unsigned int i = 0; i < angleCount; i += g_anglesPerBlock) {
+			// printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr); 
+			if (params.iRaysPerVoxelDim == 1) {
+				if (dims.iVolZ == 1) {
+					dev_par3D_BP<1><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale);
+				} else {
+					dev_par3D_BP<g_volBlockZ><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale);
+				}
+			} else
+				dev_par3D_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, params.iRaysPerVoxelDim, fOutputScale);
+		}
+
+		// TODO: Consider not synchronizing here, if possible.
+		ok = checkCuda(cudaThreadSynchronize(), "cone_bp");
+		if (!ok)
+			break;
+
+		angles = angles + angleCount;
+		// printf("%f\n", toc(t));
+
+	}
+
+	cudaDestroyTextureObject(D_texObj);
+
+	return true;
+}
+
+bool Par3DBP(cudaPitchedPtr D_volumeData,
+            cudaPitchedPtr D_projData,
+            const SDimensions3D& dims, const SPar3DProjection* angles,
+            const SProjectorParams3D& params)
+{
+	// transfer projections to array
+
+	cudaArray* cuArray = allocateProjectionArray(dims);
+	transferProjectionsToArray(D_projData, cuArray, dims);
+
+	bool ret = Par3DBP_Array(D_volumeData, cuArray, dims, angles, params);
+
+	cudaFreeArray(cuArray);
+
+	return ret;
+}
+
+
+}
diff --git a/patches/astra-toolbox-approximate-projectors/par3d_fp.cu b/patches/astra-toolbox-approximate-projectors/par3d_fp.cu
new file mode 100644
index 0000000..075784b
--- /dev/null
+++ b/patches/astra-toolbox-approximate-projectors/par3d_fp.cu
@@ -0,0 +1,770 @@
+/*
+-----------------------------------------------------------------------
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
+
+Contact: astra@astra-toolbox.com
+Website: http://www.astra-toolbox.com/
+
+This file is part of the ASTRA Toolbox.
+
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+*/
+
+#include "astra/cuda/3d/util3d.h"
+#include "astra/cuda/3d/dims3d.h"
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include <cuda.h>
+
+namespace astraCUDA3d {
+
+static const unsigned int g_anglesPerBlock = 4;
+
+// thickness of the slices we're splitting the volume up into
+static const unsigned int g_blockSlices = 32;
+static const unsigned int g_detBlockU = 32;
+static const unsigned int g_detBlockV = 32;
+
+static const unsigned g_MaxAngles = 1024;
+__constant__ float gC_RayX[g_MaxAngles];
+__constant__ float gC_RayY[g_MaxAngles];
+__constant__ float gC_RayZ[g_MaxAngles];
+__constant__ float gC_DetSX[g_MaxAngles];
+__constant__ float gC_DetSY[g_MaxAngles];
+__constant__ float gC_DetSZ[g_MaxAngles];
+__constant__ float gC_DetUX[g_MaxAngles];
+__constant__ float gC_DetUY[g_MaxAngles];
+__constant__ float gC_DetUZ[g_MaxAngles];
+__constant__ float gC_DetVX[g_MaxAngles];
+__constant__ float gC_DetVY[g_MaxAngles];
+__constant__ float gC_DetVZ[g_MaxAngles];
+
+
+// x=0, y=1, z=2
+struct DIR_X {
+	__device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolX; }
+	__device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolY; }
+	__device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolZ; }
+	__device__ float c0(float x, float y, float z) const { return x; }
+	__device__ float c1(float x, float y, float z) const { return y; }
+	__device__ float c2(float x, float y, float z) const { return z; }
+	__device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f0, f1, f2); }
+	__device__ float x(float f0, float f1, float f2) const { return f0; }
+	__device__ float y(float f0, float f1, float f2) const { return f1; }
+	__device__ float z(float f0, float f1, float f2) const { return f2; }
+};
+
+// y=0, x=1, z=2
+struct DIR_Y {
+	__device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolY; }
+	__device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolX; }
+	__device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolZ; }
+	__device__ float c0(float x, float y, float z) const { return y; }
+	__device__ float c1(float x, float y, float z) const { return x; }
+	__device__ float c2(float x, float y, float z) const { return z; }
+	__device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f1, f0, f2); }
+	__device__ float x(float f0, float f1, float f2) const { return f1; }
+	__device__ float y(float f0, float f1, float f2) const { return f0; }
+	__device__ float z(float f0, float f1, float f2) const { return f2; }
+};
+
+// z=0, x=1, y=2
+struct DIR_Z {
+	__device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolZ; }
+	__device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolX; }
+	__device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolY; }
+	__device__ float c0(float x, float y, float z) const { return z; }
+	__device__ float c1(float x, float y, float z) const { return x; }
+	__device__ float c2(float x, float y, float z) const { return y; }
+	__device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f1, f2, f0); }
+	__device__ float x(float f0, float f1, float f2) const { return f1; }
+	__device__ float y(float f0, float f1, float f2) const { return f2; }
+	__device__ float z(float f0, float f1, float f2) const { return f0; }
+};
+
+struct SCALE_CUBE {
+	float fOutputScale;
+	__device__ float scale(float a1, float a2) const { return sqrt(a1*a1+a2*a2+1.0f) * fOutputScale; }
+};
+
+struct SCALE_NONCUBE {
+	float fScale1;
+	float fScale2;
+	float fOutputScale;
+	__device__ float scale(float a1, float a2) const { return sqrt(a1*a1*fScale1+a2*a2*fScale2+1.0f) * fOutputScale; }
+};
+
+bool transferConstants(const SPar3DProjection* angles, unsigned int iProjAngles)
+{
+	// transfer angles to constant memory
+	float* tmp = new float[iProjAngles];
+
+#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+	TRANSFER_TO_CONSTANT(RayX);
+	TRANSFER_TO_CONSTANT(RayY);
+	TRANSFER_TO_CONSTANT(RayZ);
+	TRANSFER_TO_CONSTANT(DetSX);
+	TRANSFER_TO_CONSTANT(DetSY);
+	TRANSFER_TO_CONSTANT(DetSZ);
+	TRANSFER_TO_CONSTANT(DetUX);
+	TRANSFER_TO_CONSTANT(DetUY);
+	TRANSFER_TO_CONSTANT(DetUZ);
+	TRANSFER_TO_CONSTANT(DetVX);
+	TRANSFER_TO_CONSTANT(DetVY);
+	TRANSFER_TO_CONSTANT(DetVZ);
+
+#undef TRANSFER_TO_CONSTANT
+
+	delete[] tmp;
+
+	return true;
+}
+
+
+// threadIdx: x = u detector
+//            y = relative angle
+// blockIdx:  x = u/v detector
+//            y = angle block
+
+#include "rounding.h"
+
+template<class COORD, class SCALE>
+__global__ void par3D_FP_t(float* D_projData, unsigned int projPitch,
+                           cudaTextureObject_t tex,
+                           unsigned int startSlice,
+                           unsigned int startAngle, unsigned int endAngle,
+                           const SDimensions3D dims,
+                           SCALE sc)
+{
+	COORD c;
+
+	int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y;
+	if (angle >= endAngle)
+		return;
+
+	const float fRayX = gC_RayX[angle];
+	const float fRayY = gC_RayY[angle];
+	const float fRayZ = gC_RayZ[angle];
+	const float fDetUX = gC_DetUX[angle];
+	const float fDetUY = gC_DetUY[angle];
+	const float fDetUZ = gC_DetUZ[angle];
+	const float fDetVX = gC_DetVX[angle];
+	const float fDetVY = gC_DetVY[angle];
+	const float fDetVZ = gC_DetVZ[angle];
+	const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX;
+	const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY;
+	const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ;
+
+	const float a1 = c.c1(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ);
+	const float a2 = c.c2(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ);
+	const float fDistCorr = sc.scale(a1, a2);
+
+
+	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+	if (detectorU >= dims.iProjU)
+		return;
+	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
+	int endDetectorV = startDetectorV + g_detBlockV;
+	if (endDetectorV > dims.iProjV)
+		endDetectorV = dims.iProjV;
+
+	int endSlice = startSlice + g_blockSlices;
+	if (endSlice > c.nSlices(dims))
+		endSlice = c.nSlices(dims);
+
+	for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV)
+	{
+		/* Trace ray in direction Ray to (detectorU,detectorV) from  */
+		/* X = startSlice to X = endSlice                            */
+
+		const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX;
+		const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY;
+		const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ;
+
+		/*        (x)   ( 1)       ( 0)    */
+		/* ray:   (y) = (ay) * x + (by)    */
+		/*        (z)   (az)       (bz)    */
+
+		const float b1 = c.c1(fDetX,fDetY,fDetZ) - a1 * c.c0(fDetX,fDetY,fDetZ);
+		const float b2 = c.c2(fDetX,fDetY,fDetZ) - a2 * c.c0(fDetX,fDetY,fDetZ);
+
+		float fVal = 0.0f;
+
+		float f0 = startSlice + 0.5f;
+		float f1 = a1 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b1 + 0.5f*c.nDim1(dims) - 0.5f + 0.5f;
+		float f2 = a2 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b2 + 0.5f*c.nDim2(dims) - 0.5f + 0.5f;
+		//printf("%f, %f (%f), %f (%f)\n", f0, f1, a1, f2, a2); // Only f1 non linear
+
+		for (int s = startSlice; s < endSlice; ++s)
+		{
+
+			textype h5 = texto(0.5f);
+			textype f1_ = texto(f1);
+			textype f1f_ = texto(floor(f1));
+			float f1f = floor(f1);
+				
+			if ((f1 - f1f) < 0.5f) {
+			    textype fVal1 = texto(c.tex(tex, f0, f1f - 0.5f, f2));
+			    textype fVal2 = texto(c.tex(tex, f0, f1f + 0.5f, f2));
+			    fVal += texfrom(fVal1 + (f1_ + h5 - f1f_) * (fVal2 - fVal1));
+//			    fVal += texfrom(__hfma(__hadd(h5,__hsub(f1_, f1f_)), __hsub(fVal2, fVal1), fVal1));
+			} else {
+			    textype fVal1 = texto(c.tex(tex, f0, f1f + 0.5f, f2));
+			    textype fVal2 = texto(c.tex(tex, f0, f1f + 1.5f, f2));
+			    fVal += texfrom(fVal1 + (f1_ - h5 - f1f_) * (fVal2 - fVal1));
+			}
+
+//			fVal += c.tex(tex, f0, f1, f2);
+			f0 += 1.0f;
+			f1 += a1;
+			f2 += a2;
+		}
+
+		fVal *= fDistCorr;
+
+		D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal;
+	}
+}
+
+// Supersampling version
+template<class COORD>
+__global__ void par3D_FP_SS_t(float* D_projData, unsigned int projPitch,
+                              cudaTextureObject_t tex,
+                              unsigned int startSlice,
+                              unsigned int startAngle, unsigned int endAngle,
+                              const SDimensions3D dims, int iRaysPerDetDim,
+                              SCALE_NONCUBE sc)
+{
+	COORD c;
+
+	int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y;
+	if (angle >= endAngle)
+		return;
+
+	const float fRayX = gC_RayX[angle];
+	const float fRayY = gC_RayY[angle];
+	const float fRayZ = gC_RayZ[angle];
+	const float fDetUX = gC_DetUX[angle];
+	const float fDetUY = gC_DetUY[angle];
+	const float fDetUZ = gC_DetUZ[angle];
+	const float fDetVX = gC_DetVX[angle];
+	const float fDetVY = gC_DetVY[angle];
+	const float fDetVZ = gC_DetVZ[angle];
+	const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX;
+	const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY;
+	const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ;
+
+	const float a1 = c.c1(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ);
+	const float a2 = c.c2(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ);
+	const float fDistCorr = sc.scale(a1, a2);
+
+
+	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+	if (detectorU >= dims.iProjU)
+		return;
+	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
+	int endDetectorV = startDetectorV + g_detBlockV;
+	if (endDetectorV > dims.iProjV)
+		endDetectorV = dims.iProjV;
+
+	int endSlice = startSlice + g_blockSlices;
+	if (endSlice > c.nSlices(dims))
+		endSlice = c.nSlices(dims);
+
+	const float fSubStep = 1.0f/iRaysPerDetDim;
+
+	for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV)
+	{
+
+		float fV = 0.0f;
+
+		float fdU = detectorU - 0.5f + 0.5f*fSubStep;
+		for (int iSubU = 0; iSubU < iRaysPerDetDim; ++iSubU, fdU+=fSubStep) {
+		float fdV = detectorV - 0.5f + 0.5f*fSubStep;
+		for (int iSubV = 0; iSubV < iRaysPerDetDim; ++iSubV, fdV+=fSubStep) {
+
+		/* Trace ray in direction Ray to (detectorU,detectorV) from  */
+		/* X = startSlice to X = endSlice                            */
+
+		const float fDetX = fDetSX + fdU*fDetUX + fdV*fDetVX;
+		const float fDetY = fDetSY + fdU*fDetUY + fdV*fDetVY;
+		const float fDetZ = fDetSZ + fdU*fDetUZ + fdV*fDetVZ;
+
+		/*        (x)   ( 1)       ( 0)    */
+		/* ray:   (y) = (ay) * x + (by)    */
+		/*        (z)   (az)       (bz)    */
+
+		const float b1 = c.c1(fDetX,fDetY,fDetZ) - a1 * c.c0(fDetX,fDetY,fDetZ);
+		const float b2 = c.c2(fDetX,fDetY,fDetZ) - a2 * c.c0(fDetX,fDetY,fDetZ);
+
+
+		float fVal = 0.0f;
+
+		float f0 = startSlice + 0.5f;
+		float f1 = a1 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b1 + 0.5f*c.nDim1(dims) - 0.5f + 0.5f;
+		float f2 = a2 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b2 + 0.5f*c.nDim2(dims) - 0.5f + 0.5f;
+
+		for (int s = startSlice; s < endSlice; ++s)
+		{
+			fVal += c.tex(tex, f0, f1, f2);
+
+			f0 += 1.0f;
+			f1 += a1;
+			f2 += a2;
+		}
+
+		fV += fVal;
+
+		}
+		}
+
+		fV *= fDistCorr;
+		D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fV / (iRaysPerDetDim * iRaysPerDetDim);
+	}
+}
+
+
+__device__ float dirWeights(float fX, float fN) {
+	if (fX <= -0.5f) // outside image on left
+		return 0.0f;
+	if (fX <= 0.5f) // half outside image on left
+		return (fX + 0.5f) * (fX + 0.5f);
+	if (fX <= fN - 0.5f) { // inside image
+		float t = fX + 0.5f - floorf(fX + 0.5f);
+		return t*t + (1-t)*(1-t);
+	}
+	if (fX <= fN + 0.5f) // half outside image on right
+		return (fN + 0.5f - fX) * (fN + 0.5f - fX);
+	return 0.0f; // outside image on right
+}
+
+template<class COORD>
+__global__ void par3D_FP_SumSqW_t(float* D_projData, unsigned int projPitch,
+                                  unsigned int startSlice,
+                                  unsigned int startAngle, unsigned int endAngle,
+                                  const SDimensions3D dims,
+                                  SCALE_NONCUBE sc)
+{
+	COORD c;
+
+	int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y;
+	if (angle >= endAngle)
+		return;
+
+	const float fRayX = gC_RayX[angle];
+	const float fRayY = gC_RayY[angle];
+	const float fRayZ = gC_RayZ[angle];
+	const float fDetUX = gC_DetUX[angle];
+	const float fDetUY = gC_DetUY[angle];
+	const float fDetUZ = gC_DetUZ[angle];
+	const float fDetVX = gC_DetVX[angle];
+	const float fDetVY = gC_DetVY[angle];
+	const float fDetVZ = gC_DetVZ[angle];
+	const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX;
+	const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY;
+	const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ;
+
+	const float a1 = c.c1(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ);
+	const float a2 = c.c2(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ);
+	const float fDistCorr = sc.scale(a1, a2);
+
+
+	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+	if (detectorU >= dims.iProjU)
+		return;
+	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
+	int endDetectorV = startDetectorV + g_detBlockV;
+	if (endDetectorV > dims.iProjV)
+		endDetectorV = dims.iProjV;
+
+	int endSlice = startSlice + g_blockSlices;
+	if (endSlice > c.nSlices(dims))
+		endSlice = c.nSlices(dims);
+
+	for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV)
+	{
+		/* Trace ray in direction Ray to (detectorU,detectorV) from  */
+		/* X = startSlice to X = endSlice                            */
+
+		const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX;
+		const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY;
+		const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ;
+
+		/*        (x)   ( 1)       ( 0)    */
+		/* ray:   (y) = (ay) * x + (by)    */
+		/*        (z)   (az)       (bz)    */
+
+		const float b1 = c.c1(fDetX,fDetY,fDetZ) - a1 * c.c0(fDetX,fDetY,fDetZ);
+		const float b2 = c.c2(fDetX,fDetY,fDetZ) - a2 * c.c0(fDetX,fDetY,fDetZ);
+
+		float fVal = 0.0f;
+
+		float f0 = startSlice + 0.5f;
+		float f1 = a1 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b1 + 0.5f*c.nDim1(dims) - 0.5f + 0.5f;
+		float f2 = a2 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b2 + 0.5f*c.nDim2(dims) - 0.5f + 0.5f;
+
+		for (int s = startSlice; s < endSlice; ++s)
+		{
+			fVal += dirWeights(f1, c.nDim1(dims)) * dirWeights(f2, c.nDim2(dims));
+			f0 += 1.0f;
+			f1 += a1;
+			f2 += a2;
+		}
+
+		fVal *= fDistCorr * fDistCorr;
+		D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal;
+	}
+}
+
+// Supersampling version
+// TODO
+
+
+bool Par3DFP_Array_internal(cudaPitchedPtr D_projData,
+                   cudaTextureObject_t D_texObj,
+                   const SDimensions3D& dims,
+                   unsigned int angleCount, const SPar3DProjection* angles,
+                   const SProjectorParams3D& params)
+{
+	if (!transferConstants(angles, angleCount))
+		return false;
+
+	std::list<cudaStream_t> streams;
+	dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles
+
+	// Run over all angles, grouping them into groups of the same
+	// orientation (roughly horizontal vs. roughly vertical).
+	// Start a stream of grids for each such group.
+
+	unsigned int blockStart = 0;
+	unsigned int blockEnd = 0;
+	int blockDirection = 0;
+
+	bool cube = true;
+	if (abs(params.fVolScaleX / params.fVolScaleY - 1.0) > 0.00001)
+		cube = false;
+	if (abs(params.fVolScaleX / params.fVolScaleZ - 1.0) > 0.00001)
+		cube = false;
+
+	SCALE_CUBE scube;
+	scube.fOutputScale = params.fOutputScale * params.fVolScaleX;
+
+	SCALE_NONCUBE snoncubeX;
+	float fS1 = params.fVolScaleY / params.fVolScaleX;
+	snoncubeX.fScale1 = fS1 * fS1;
+	float fS2 = params.fVolScaleZ / params.fVolScaleX;
+	snoncubeX.fScale2 = fS2 * fS2;
+	snoncubeX.fOutputScale = params.fOutputScale * params.fVolScaleX;
+
+	SCALE_NONCUBE snoncubeY;
+	fS1 = params.fVolScaleX / params.fVolScaleY;
+	snoncubeY.fScale1 = fS1 * fS1;
+	fS2 = params.fVolScaleY / params.fVolScaleY;
+	snoncubeY.fScale2 = fS2 * fS2;
+	snoncubeY.fOutputScale = params.fOutputScale * params.fVolScaleY;
+
+	SCALE_NONCUBE snoncubeZ;
+	fS1 = params.fVolScaleX / params.fVolScaleZ;
+	snoncubeZ.fScale1 = fS1 * fS1;
+	fS2 = params.fVolScaleY / params.fVolScaleZ;
+	snoncubeZ.fScale2 = fS2 * fS2;
+	snoncubeZ.fOutputScale = params.fOutputScale * params.fVolScaleZ;
+
+	// timeval t;
+	// tic(t);
+
+	for (unsigned int a = 0; a <= angleCount; ++a) {
+		int dir = -1;
+		if (a != angleCount) {
+			float dX = fabsf(angles[a].fRayX);
+			float dY = fabsf(angles[a].fRayY);
+			float dZ = fabsf(angles[a].fRayZ);
+
+			if (dX >= dY && dX >= dZ)
+				dir = 0;
+			else if (dY >= dX && dY >= dZ)
+				dir = 1;
+			else
+				dir = 2;
+		}
+
+		if (a == angleCount || dir != blockDirection) {
+			// block done
+
+			blockEnd = a;
+			if (blockStart != blockEnd) {
+
+				dim3 dimGrid(
+				             ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
+(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
+				// TODO: consider limiting number of handle (chaotic) geoms
+				//       with many alternating directions
+				cudaStream_t stream;
+				cudaStreamCreate(&stream);
+				streams.push_back(stream);
+
+				// printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y);
+
+				if (blockDirection == 0) {
+					for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices)
+						if (params.iRaysPerDetDim == 1)
+								if (cube)
+										par3D_FP_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, scube);
+								else
+										par3D_FP_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float),  D_texObj,i, blockStart, blockEnd, dims, snoncubeX);
+						else
+							par3D_FP_SS_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float),  D_texObj,i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeX);
+				} else if (blockDirection == 1) {
+					for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices)
+						if (params.iRaysPerDetDim == 1)
+								if (cube)
+										par3D_FP_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float),  D_texObj,i, blockStart, blockEnd, dims, scube);
+								else
+										par3D_FP_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float),  D_texObj,i, blockStart, blockEnd, dims, snoncubeY);
+						else
+							par3D_FP_SS_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float),  D_texObj,i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeY);
+				} else if (blockDirection == 2) {
+					for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices)
+						if (params.iRaysPerDetDim == 1)
+								if (cube)
+										par3D_FP_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float),  D_texObj,i, blockStart, blockEnd, dims, scube);
+								else
+										par3D_FP_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float),  D_texObj,i, blockStart, blockEnd, dims, snoncubeZ);
+						else
+							par3D_FP_SS_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float),  D_texObj,i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeZ);
+				}
+
+			}
+
+			blockDirection = dir;
+			blockStart = a;
+		}
+	}
+
+	bool ok = true;
+
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+		ok &= checkCuda(cudaStreamSynchronize(*iter), "par3d_fp");
+		cudaStreamDestroy(*iter);
+	}
+
+	// printf("%f\n", toc(t));
+
+	return ok;
+}
+
+bool Par3DFP(cudaPitchedPtr D_volumeData,
+             cudaPitchedPtr D_projData,
+             const SDimensions3D& dims, const SPar3DProjection* angles,
+             const SProjectorParams3D& params)
+{
+
+	// transfer volume to array
+	cudaArray* cuArray = allocateVolumeArray(dims);
+	transferVolumeToArray(D_volumeData, cuArray, dims);
+
+	cudaTextureObject_t D_texObj;
+	if (!createTextureObject3D(cuArray, D_texObj)) {
+		cudaFreeArray(cuArray);
+		return false;
+	}
+
+	bool ret;
+
+	for (unsigned int iAngle = 0; iAngle < dims.iProjAngles; iAngle += g_MaxAngles) {
+		unsigned int iEndAngle = iAngle + g_MaxAngles;
+		if (iEndAngle >= dims.iProjAngles)
+			iEndAngle = dims.iProjAngles;
+
+		cudaPitchedPtr D_subprojData = D_projData;
+		D_subprojData.ptr = (char*)D_projData.ptr + iAngle * D_projData.pitch;
+
+		ret = Par3DFP_Array_internal(D_subprojData, D_texObj,
+		                             dims, iEndAngle - iAngle, angles + iAngle,
+		                             params);
+		if (!ret)
+			break;
+	}
+
+	cudaFreeArray(cuArray);
+
+	cudaDestroyTextureObject(D_texObj);
+
+	return ret;
+}
+
+
+
+bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData,
+                    cudaPitchedPtr D_projData,
+                    const SDimensions3D& dims, const SPar3DProjection* angles,
+                    const SProjectorParams3D& params)
+{
+	// transfer angles to constant memory
+	float* tmp = new float[dims.iProjAngles];
+
+#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+	TRANSFER_TO_CONSTANT(RayX);
+	TRANSFER_TO_CONSTANT(RayY);
+	TRANSFER_TO_CONSTANT(RayZ);
+	TRANSFER_TO_CONSTANT(DetSX);
+	TRANSFER_TO_CONSTANT(DetSY);
+	TRANSFER_TO_CONSTANT(DetSZ);
+	TRANSFER_TO_CONSTANT(DetUX);
+	TRANSFER_TO_CONSTANT(DetUY);
+	TRANSFER_TO_CONSTANT(DetUZ);
+	TRANSFER_TO_CONSTANT(DetVX);
+	TRANSFER_TO_CONSTANT(DetVY);
+	TRANSFER_TO_CONSTANT(DetVZ);
+
+#undef TRANSFER_TO_CONSTANT
+
+	delete[] tmp;
+
+	std::list<cudaStream_t> streams;
+	dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles
+
+	// Run over all angles, grouping them into groups of the same
+	// orientation (roughly horizontal vs. roughly vertical).
+	// Start a stream of grids for each such group.
+
+	unsigned int blockStart = 0;
+	unsigned int blockEnd = 0;
+	int blockDirection = 0;
+
+	SCALE_NONCUBE snoncubeX;
+	float fS1 = params.fVolScaleY / params.fVolScaleX;
+	snoncubeX.fScale1 = fS1 * fS1;
+	float fS2 = params.fVolScaleZ / params.fVolScaleX;
+	snoncubeX.fScale2 = fS2 * fS2;
+	snoncubeX.fOutputScale = params.fOutputScale * params.fVolScaleX;
+
+	SCALE_NONCUBE snoncubeY;
+	fS1 = params.fVolScaleX / params.fVolScaleY;
+	snoncubeY.fScale1 = fS1 * fS1;
+	fS2 = params.fVolScaleY / params.fVolScaleY;
+	snoncubeY.fScale2 = fS2 * fS2;
+	snoncubeY.fOutputScale = params.fOutputScale * params.fVolScaleY;
+
+	SCALE_NONCUBE snoncubeZ;
+	fS1 = params.fVolScaleX / params.fVolScaleZ;
+	snoncubeZ.fScale1 = fS1 * fS1;
+	fS2 = params.fVolScaleY / params.fVolScaleZ;
+	snoncubeZ.fScale2 = fS2 * fS2;
+	snoncubeZ.fOutputScale = params.fOutputScale * params.fVolScaleZ;
+
+
+	// timeval t;
+	// tic(t);
+
+	for (unsigned int a = 0; a <= dims.iProjAngles; ++a) {
+		int dir;
+		if (a != dims.iProjAngles) {
+			float dX = fabsf(angles[a].fRayX);
+			float dY = fabsf(angles[a].fRayY);
+			float dZ = fabsf(angles[a].fRayZ);
+
+			if (dX >= dY && dX >= dZ)
+				dir = 0;
+			else if (dY >= dX && dY >= dZ)
+				dir = 1;
+			else
+				dir = 2;
+		}
+
+		if (a == dims.iProjAngles || dir != blockDirection) {
+			// block done
+
+			blockEnd = a;
+			if (blockStart != blockEnd) {
+
+				dim3 dimGrid(
+				             ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
+(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
+				// TODO: check if we can't immediately
+				//       destroy the stream after use
+				cudaStream_t stream;
+				cudaStreamCreate(&stream);
+				streams.push_back(stream);
+
+				// printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y);
+
+				if (blockDirection == 0) {
+					for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices)
+						if (params.iRaysPerDetDim == 1)
+							par3D_FP_SumSqW_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, snoncubeX);
+						else
+#if 0
+							par3D_FP_SS_SumSqW_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+#else
+							assert(false);
+#endif
+				} else if (blockDirection == 1) {
+					for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices)
+						if (params.iRaysPerDetDim == 1)
+							par3D_FP_SumSqW_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, snoncubeY);
+						else
+#if 0
+							par3D_FP_SS_SumSqW_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+#else
+							assert(false);
+#endif
+				} else if (blockDirection == 2) {
+					for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices)
+						if (params.iRaysPerDetDim == 1)
+							par3D_FP_SumSqW_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, snoncubeZ);
+						else
+#if 0
+							par3D_FP_SS_SumSqW_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+#else
+							assert(false);
+#endif
+				}
+
+			}
+
+			blockDirection = dir;
+			blockStart = a;
+		}
+	}
+
+	bool ok = true;
+
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+		ok &= checkCuda(cudaStreamSynchronize(*iter), "Par3DFP_SumSqW");
+		cudaStreamDestroy(*iter);
+	}
+
+	// printf("%f\n", toc(t));
+
+	return ok;
+}
+
+
+
+
+
+
+
+}
diff --git a/patches/astra-toolbox-approximate-projectors/rounding.h b/patches/astra-toolbox-approximate-projectors/rounding.h
new file mode 100644
index 0000000..c1cbffb
--- /dev/null
+++ b/patches/astra-toolbox-approximate-projectors/rounding.h
@@ -0,0 +1,50 @@
+#include <cuda_fp16.h>
+
+#define precision 8
+#define approximate_interpolation
+
+#ifdef approximate_interpolation
+# ifdef precision
+#  if precision == 16
+#   define texto(v) __float2half(v)
+#   define texfrom(v) __half2float(v)
+#   define textype half
+#   define interpolate(v0, v1, pos) (v0 + pos*(v1-v0))
+#   define textocheck(var,msg,val) var=texto(val);
+#  else
+#   define precision_mult ((1<<precision)-1)
+#   define texto(v) ((unsigned char)(precision_mult*v))
+#   define texfrom(v) (1.f * v / precision_mult)
+#   define textype unsigned
+#   define interpolate(v0, v1, pos) (v0 + ((unsigned)(256 * pos)) * (v1 - v0) / 256)
+#   define textocheck(var, msg, val) \
+	if ((val<0)||(val>1)) { printf("Received out-of-range value (%f) in %s texture fetch\n", val, msg); } \
+	var=texto(val);
+#  endif
+# else
+#  define texto(v) (v)
+#  define texfrom(v) (v)
+#  define textype float
+#  define interpolate(v0, v1, pos) (v0 + pos*(v1-v0))
+#  define textocheck(var,msg,val) var=texto(val);
+# endif
+#else
+# ifdef precision
+#  if precision == 16
+#   define texto(v) __half2float(__float2half(v))
+#   define textocheck(var,msg,val) var=texto(val);
+#  else
+#   define precision_mult ((1<<precision)-1)
+#   define texto(v) (floor(precision_mult*v)/precision_mult)
+#   define textocheck(var, msg, val) \
+	if ((val<0)||(val>1)) { printf("Received out-of-range value (%f) in %s texture fetch\n", val, msg); } \
+	var=texto(val);
+#  endif
+# else
+#  define texto(v) (v)
+#  define textocheck(var,msg,val) var=texto(val);
+# endif
+# define texfrom(v) (v)
+# define textype float
+# define interpolate(v0, v1, pos) (v0 + pos*(v1-v0))
+#endif
diff --git a/patches/ccpi-regularisation-toolkit-fast-fgptv/CMakeLists.txt b/patches/ccpi-regularisation-toolkit-fast-fgptv/CMakeLists.txt
new file mode 100644
index 0000000..893abc5
--- /dev/null
+++ b/patches/ccpi-regularisation-toolkit-fast-fgptv/CMakeLists.txt
@@ -0,0 +1,141 @@
+#   Copyright 2018 Edoardo Pasca
+#cmake_minimum_required (VERSION 3.0)
+
+project(RGL_core)
+#https://stackoverflow.com/questions/13298504/using-cmake-with-setup-py
+
+# The version number.
+
+set (CIL_VERSION $ENV{CIL_VERSION} CACHE INTERNAL "Core Imaging Library version" FORCE)
+
+# conda orchestrated build
+message("CIL_VERSION ${CIL_VERSION}")
+#include (GenerateExportHeader)
+
+## Build the regularisers package as a library
+message("Creating Regularisers as a shared library")
+
+set(CMAKE_BUILD_TYPE "Release")
+
+set (EXTRA_LIBRARIES "")
+if(WIN32)
+  set (FLAGS "/DWIN32 /EHsc /DCCPiCore_EXPORTS /openmp")
+  set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:MSVCRT.lib")
+  message("library lib: ${LIBRARY_LIB}")
+elseif(APPLE)
+  set (FLAGS "-DCCPiReconstructionIterative_EXPORTS ")
+elseif(UNIX)
+   set (FLAGS "-O3 -funsigned-char -Wall  -Wl,--no-undefined  -DCCPiReconstructionIterative_EXPORTS")
+   set(EXTRA_LIBRARIES "m")
+endif()
+
+#set(CMAKE_C_COMPILER /opt/intel/compilers_and_libraries/linux/bin/intel64/icc)
+#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp")
+#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -axAVX2 -xAVX2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp")
+#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp")
+
+#set(CMAKE_C_COMPILER clang)
+#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopenmp")
+
+#set(CMAKE_C_COMPILER gcc-9)
+set(CMAKE_C_FLAGS "-march=native -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -fopenmp")
+#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp")
+#set(CMAKE_C_FLAGS "-march=native -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp")
+#set(CMAKE_C_FLAGS "-march=native -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp")
+#set(CMAKE_C_FLAGS_RELEASE "-g -gdwarf-2 -g3 -fno-omit-frame-pointer")
+
+  set (CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} ${FLAGS}")
+  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${FLAGS}")
+message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}")
+
+
+## Build the regularisers package as a library
+message("Adding regularisers as a shared library")
+
+add_library(cilreg SHARED
+	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_TV_core.c
+	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/SB_TV_core.c
+            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PD_TV_core.c
+	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/TGV_core.c
+	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Diffusion_core.c
+	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Diffus4th_order_core.c
+	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/LLT_ROF_core.c
+            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/ROF_TV_core.c
+            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_dTV_core.c
+            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/TNV_core.c
+            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Nonlocal_TV_core.c
+            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PatchSelect_core.c
+	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/utils.c
+	    )
+target_link_libraries(cilreg ${OpenMP_EXE_LINKER_FLAGS} ${EXTRA_LIBRARIES})
+#set (CMAKE_SHARED_LINKER_FLAGS "-g -gdwarf-2 -g3 -fno-omit-frame-pointer")
+#target_link_options(cilreg PUBLIC 
+include_directories(cilreg PUBLIC
+                      ${LIBRARY_INC}/include
+					  ${CMAKE_CURRENT_SOURCE_DIR}
+		              ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/ )
+
+## Install
+
+if (UNIX)
+message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib")
+install(TARGETS cilreg
+	LIBRARY DESTINATION lib
+	CONFIGURATIONS ${CMAKE_BUILD_TYPE}
+	)
+elseif(WIN32)
+message ("I'd install into ${CMAKE_INSTALL_PREFIX} lib bin")
+  install(TARGETS cilreg
+	RUNTIME DESTINATION bin
+	ARCHIVE DESTINATION lib
+	CONFIGURATIONS ${CMAKE_BUILD_TYPE}
+	)
+endif()
+
+
+
+# GPU Regularisers
+if (BUILD_CUDA)
+    find_package(CUDA)
+    if (CUDA_FOUND)
+      set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -shared -D_FORCE_INLINES")
+      message("CUDA FLAGS ${CUDA_NVCC_FLAGS}")
+      CUDA_ADD_LIBRARY(cilregcuda SHARED
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_ROF_GPU_core.cu
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_FGP_GPU_core.cu
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_PD_GPU_core.cu
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_SB_GPU_core.cu
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/LLT_ROF_GPU_core.cu
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TGV_GPU_core.cu
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/dTV_FGP_GPU_core.cu
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/NonlDiff_GPU_core.cu
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/Diffus_4thO_GPU_core.cu
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/PatchSelect_GPU_core.cu
+      )
+      if (UNIX)
+        message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib")
+        install(TARGETS cilregcuda
+        LIBRARY DESTINATION lib
+        CONFIGURATIONS ${CMAKE_BUILD_TYPE}
+        )
+      elseif(WIN32)
+        message ("I'd install into ${CMAKE_INSTALL_PREFIX} lib bin")
+        install(TARGETS cilregcuda
+        RUNTIME DESTINATION bin
+        ARCHIVE DESTINATION lib
+        CONFIGURATIONS ${CMAKE_BUILD_TYPE}
+        )
+      endif()
+    else()
+      message("CUDA NOT FOUND")
+    endif()
+endif()
+
+if (${BUILD_MATLAB_WRAPPER})
+  if (WIN32)
+        install(TARGETS cilreg DESTINATION ${MATLAB_DEST})
+        if (CUDA_FOUND)
+            install(TARGETS cilregcuda DESTINATION ${MATLAB_DEST})
+        endif()
+  endif()
+endif()
diff --git a/patches/ccpi-regularisation-toolkit-fast-fgptv/FGP_TV_core.c b/patches/ccpi-regularisation-toolkit-fast-fgptv/FGP_TV_core.c
new file mode 100644
index 0000000..91fd746
--- /dev/null
+++ b/patches/ccpi-regularisation-toolkit-fast-fgptv/FGP_TV_core.c
@@ -0,0 +1,266 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2019 Daniil Kazantsev
+ * Copyright 2019 Srikanth Nagella, Edoardo Pasca
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "FGP_TV_core.h"
+
+/* C-OMP implementation of FGP-TV [1] denoising/regularization model (2D/3D case)
+ *
+ * Input Parameters:
+ * 1. Noisy image/volume
+ * 2. lambdaPar - regularization parameter
+ * 3. Number of iterations
+ * 4. eplsilon: tolerance constant
+ * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1)
+ * 6. nonneg: 'nonnegativity (0 is OFF by default)
+ *
+ * Output:
+ * [1] Filtered/regularized image/volume
+ * [2] Information vector which contains [iteration no., reached tolerance]
+ *
+ * This function is based on the Matlab's code and paper by
+ * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
+ */
+
+float TV_FGP_CPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int nonneg, int dimX, int dimY, int dimZ)
+{
+    int ll;
+    long j, DimTotal;
+    float re, re1;
+    re = 0.0f; re1 = 0.0f;
+    float tk = 1.0f;
+    float tkp1 =1.0f;
+    int count = 0;
+
+    if (dimZ <= 1) {
+        /*2D case */
+        float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL;
+        DimTotal = (long)(dimX*dimY);
+
+        if (epsil != 0.0f) Output_prev = calloc(DimTotal, sizeof(float));
+        P1 = calloc(DimTotal, sizeof(float));
+        P2 = calloc(DimTotal, sizeof(float));
+        P1_prev = calloc(DimTotal, sizeof(float));
+        P2_prev = calloc(DimTotal, sizeof(float));
+        R1 = calloc(DimTotal, sizeof(float));
+        R2 = calloc(DimTotal, sizeof(float));
+
+        /* begin iterations */
+        for(ll=0; ll<iterationsNumb; ll++) {
+
+            if ((epsil != 0.0f)  && (ll % 5 == 0)) copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), 1l);
+            /* computing the gradient of the objective function */
+            Obj_func2D(Input, Output, R1, R2, lambdaPar, (long)(dimX), (long)(dimY));
+
+            /* apply nonnegativity */
+            if (nonneg == 1) for(j=0; j<DimTotal; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;}
+
+            /*Taking a step towards minus of the gradient*/
+            Grad_func2D(P1, P2, Output, R1, R2, lambdaPar, (long)(dimX), (long)(dimY));
+
+            /* projection step */
+            Proj_func2D(P1, P2, methodTV, DimTotal);
+
+            /*updating R and t*/
+            tkp1 = (1.0f + sqrtf(1.0f + 4.0f*tk*tk))*0.5f;
+            Rupd_func2D(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, DimTotal);
+
+            /*storing old values*/
+            copyIm(P1, P1_prev, (long)(dimX), (long)(dimY), 1l);
+            copyIm(P2, P2_prev, (long)(dimX), (long)(dimY), 1l);
+            tk = tkp1;
+
+            /* check early stopping criteria */
+            if ((epsil != 0.0f)  && (ll % 5 == 0)) {
+                re = 0.0f; re1 = 0.0f;
+                for(j=0; j<DimTotal; j++)
+                {
+                    re += powf(Output[j] - Output_prev[j],2);
+                    re1 += powf(Output[j],2);
+                }
+                re = sqrtf(re)/sqrtf(re1);
+                if (re < epsil)  count++;
+                if (count > 3) break;
+            }
+        }
+        if (epsil != 0.0f) free(Output_prev);
+        free(P1); free(P2); free(P1_prev); free(P2_prev); free(R1); free(R2);
+    }
+    else {
+        /*3D case*/
+        float *P1=NULL, *P2=NULL, *P3=NULL, *R1=NULL, *R2=NULL, *R3=NULL;
+        DimTotal = (long)dimX*(long)dimY*(long)dimZ;
+
+        P1 = calloc(DimTotal, sizeof(float));
+        P2 = calloc(DimTotal, sizeof(float));
+        P3 = calloc(DimTotal, sizeof(float));
+
+        R1 = calloc(DimTotal, sizeof(float));
+        R2 = calloc(DimTotal, sizeof(float));
+        R3 = calloc(DimTotal, sizeof(float));
+
+        /* begin iterations */
+        for(ll=0; ll<iterationsNumb; ll++) {
+            /* computing the gradient of the objective function */
+            re = Obj_func3D(Input, Output, R1, R2, R3, lambdaPar, nonneg, (long)(dimX), (long)(dimY), (long)(dimZ));
+
+            tkp1 = (1.0f + sqrtf(1.0f + 4.0f*tk*tk))*0.5f;
+	    All_func3D(P1, P2, P3, Output, R1, R2, R3, lambdaPar, tkp1, tk, (long)(dimX), (long)(dimY), (long)(dimZ));
+
+            // calculate norm - stopping rules
+            if ((epsil != 0.0f)  && (ll % 5 == 0)) {
+                // stop if the norm residual is less than the tolerance EPS
+                if (re < epsil)  count++;
+                if (count > 3) break;
+            }
+
+            tk = tkp1;
+        }
+
+        free(P1); free(P2); free(P3); free(R1); free(R2); free(R3);
+//	printf("TV iters %i (of %i)\n", ll, iterationsNumb);
+    }
+
+    /*adding info into info_vector */
+    infovector[0] = (float)(ll);  /*iterations number (if stopped earlier based on tolerance)*/
+    infovector[1] = re;  /* reached tolerance */
+
+    return 0;
+}
+
+float Obj_func2D(float *A, float *D, float *R1, float *R2, float lambda, long dimX, long dimY)
+{
+    float val1, val2;
+    long i,j,index;
+#pragma omp parallel for shared(A,D,R1,R2) private(index,i,j,val1,val2)
+    for(j=0; j<dimY; j++) {
+        for(i=0; i<dimX; i++) {
+            index = j*dimX+i;
+            /* boundary conditions  */
+            if (i == 0) {val1 = 0.0f;} else {val1 = R1[j*dimX + (i-1)];}
+            if (j == 0) {val2 = 0.0f;} else {val2 = R2[(j-1)*dimX + i];}
+            D[index] = A[index] - lambda*(R1[index] + R2[index] - val1 - val2);
+        }}
+    return *D;
+}
+float Grad_func2D(float *P1, float *P2, float *D, float *R1, float *R2, float lambda,  long dimX, long dimY)
+{
+    float val1, val2, multip;
+    long i,j,index;
+    multip = (1.0f/(8.0f*lambda));
+#pragma omp parallel for shared(P1,P2,D,R1,R2,multip) private(index,i,j,val1,val2)
+    for(j=0; j<dimY; j++) {
+        for(i=0; i<dimX; i++) {
+            index = j*dimX+i;
+            /* boundary conditions */
+            if (i == dimX-1) val1 = 0.0f; else val1 = D[index] - D[j*dimX + (i+1)];
+            if (j == dimY-1) val2 = 0.0f; else val2 = D[index] - D[(j+1)*dimX + i];
+            P1[index] = R1[index] + multip*val1;
+            P2[index] = R2[index] + multip*val2;
+        }}
+    return 1;
+}
+float Rupd_func2D(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, long DimTotal)
+{
+    long i;
+    float multip;
+    multip = ((tk-1.0f)/tkp1);
+#pragma omp parallel for shared(P1,P2,P1_old,P2_old,R1,R2,multip) private(i)
+    for(i=0; i<DimTotal; i++) {
+        R1[i] = P1[i] + multip*(P1[i] - P1_old[i]);
+        R2[i] = P2[i] + multip*(P2[i] - P2_old[i]);
+    }
+    return 1;
+}
+
+    // too much threads poisoning caches. there is a sweet spot
+#define n_threads 16
+
+/* 3D-case related Functions */
+/*****************************************************************/
+float Obj_func3D(float *A, float *D, float *R1, float *R2, float *R3, float lambda, int nonneg, long dimX, long dimY, long dimZ)
+{
+    float D_new;
+    float val1, val2, val3;
+    float re = 0.0f, re1 = 0.0f;
+    long i,j,k,index;
+
+#pragma omp parallel for reduction(+ : re, re1) shared(A,D,R1,R2,R3) private(index,i,j,k,val1,val2,val3,D_new) num_threads(n_threads)
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<dimY; j++) {
+            for(i=0; i<dimX; i++) {
+                index = (dimX*dimY)*k + j*dimX+i;
+                /* boundary conditions */
+                val1 = (i == 0)?0.f:R1[(dimX*dimY)*k + j*dimX + (i-1)];
+                val2 = (j == 0)?0.f:R2[(dimX*dimY)*k + (j-1)*dimX + i];
+                val3 = (k == 0)?0.f:R3[(dimX*dimY)*(k-1) + j*dimX + i];
+
+                D_new = A[index] - lambda*(R1[index] + R2[index] + R3[index] - val1 - val2 - val3);
+		if ((nonneg == 1)&&(D_new < 0.0f)) D_new = 0.0f;
+
+		re += powf(D_new - D[index], 2);
+		re1 += powf(D_new, 2);
+		D[index] = D_new;
+            }}}
+	
+    return sqrtf(re)/sqrtf(re1);
+}
+
+float All_func3D(float *P1_old, float *P2_old, float *P3_old, float *D, float *R1, float *R2, float *R3, float lambda, float tkp1, float tk, long dimX, long dimY, long dimZ)
+{
+    float val1, val2, val3, denom, sq_denom;
+    float P1_new, P2_new, P3_new;
+    long i,j,k, index;
+    float multip = (1.0f/(26.0f*lambda));
+    float multip2 = ((tk-1.0f)/tkp1);
+
+#pragma omp parallel for shared(P1_old,P2_old,P3_old,D,R1,R2,R3) private(index,i,j,k,val1,val2,val3,denom,sq_denom,multip,multip2,P1_new,P2_new,P3_new) num_threads(n_threads)
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<dimY; j++) {
+            for(i=0; i<dimX; i++) {
+                index = (dimX*dimY)*k + j*dimX+i;
+
+                /* boundary conditions */
+                val1 = (i == dimX-1)?0.0f: D[index] - D[(dimX*dimY)*k + j*dimX + (i+1)];
+                val2 = (j == dimY-1)?0.0f: D[index] - D[(dimX*dimY)*k + (j+1)*dimX + i];
+                val3 = (k == dimZ-1)?0.0f: D[index] - D[(dimX*dimY)*(k+1) + j*dimX + i];
+
+                P1_new = R1[index] + multip*val1;
+                P2_new = R2[index] + multip*val2;
+                P3_new = R3[index] + multip*val3;
+
+		denom = powf(P1_new,2) + powf(P2_new,2) + powf(P3_new,2);
+		if (denom > 1.0f) {
+		    sq_denom = 1.0f/sqrtf(denom);
+		    P1_new *= sq_denom;
+		    P2_new *= sq_denom;
+		    P3_new *= sq_denom;
+        	}
+
+		R1[index] = P1_new + multip2*(P1_new - P1_old[index]);
+		R2[index] = P2_new + multip2*(P2_new - P2_old[index]);
+		R3[index] = P3_new + multip2*(P3_new - P3_old[index]);
+		
+		P1_old[index] = P1_new;
+		P2_old[index] = P2_new;
+		P3_old[index] = P3_new;
+            }}}
+
+
+    return 1;
+}
diff --git a/patches/ccpi-regularisation-toolkit-fast-fgptv/FGP_TV_core.h b/patches/ccpi-regularisation-toolkit-fast-fgptv/FGP_TV_core.h
new file mode 100644
index 0000000..cda0f40
--- /dev/null
+++ b/patches/ccpi-regularisation-toolkit-fast-fgptv/FGP_TV_core.h
@@ -0,0 +1,62 @@
+/*
+This work is part of the Core Imaging Library developed by
+Visual Analytics and Imaging System Group of the Science Technology
+Facilities Council, STFC
+
+Copyright 2017 Daniil Kazantsev
+Copyright 2017 Srikanth Nagella, Edoardo Pasca
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+//#include <matrix.h>
+#include <math.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <stdio.h>
+#include "omp.h"
+#include "utils.h"
+#include "CCPiDefines.h"
+
+/* C-OMP implementation of FGP-TV [1] denoising/regularization model (2D/3D case)
+ *
+ * Input Parameters:
+ * 1. Noisy image/volume
+ * 2. lambda - regularization parameter
+ * 3. Number of iterations
+ * 4. eplsilon: tolerance constant
+ * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1)
+ * 6. nonneg: 'nonnegativity (0 is OFF by default)
+ *
+ * Output:
+ * [1] Filtered/regularized image/volume
+ * [2] Information vector which contains [iteration no., reached tolerance]
+
+ *
+ * This function is based on the Matlab's code and paper by
+ * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+CCPI_EXPORT float TV_FGP_CPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int nonneg, int dimX, int dimY, int dimZ);
+
+CCPI_EXPORT float Obj_func2D(float *A, float *D, float *R1, float *R2, float lambda, long dimX, long dimY);
+CCPI_EXPORT float Grad_func2D(float *P1, float *P2, float *D, float *R1, float *R2, float lambda, long dimX, long dimY);
+CCPI_EXPORT float Rupd_func2D(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, long DimTotal);
+
+CCPI_EXPORT float Obj_func3D(float *A, float *D, float *R1, float *R2, float *R3, float lambda, int nonneg, long dimX, long dimY, long dimZ);
+CCPI_EXPORT float All_func3D(float *P1_old, float *P2_old, float *P3_old, float *D, float *R1, float *R2, float *R3, float lambda, float tkp1, float tk, long dimX, long dimY, long dimZ);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/patches/ccpi-regularisation-toolkit-fast-tnv/CMakeLists.txt b/patches/ccpi-regularisation-toolkit-fast-tnv/CMakeLists.txt
new file mode 100644
index 0000000..1a3b37a
--- /dev/null
+++ b/patches/ccpi-regularisation-toolkit-fast-tnv/CMakeLists.txt
@@ -0,0 +1,169 @@
+#   Copyright 2018 Edoardo Pasca
+#cmake_minimum_required (VERSION 3.0)
+
+project(RGL_core)
+#https://stackoverflow.com/questions/13298504/using-cmake-with-setup-py
+
+# The version number.
+
+set (CIL_VERSION $ENV{CIL_VERSION} CACHE INTERNAL "Core Imaging Library version" FORCE)
+
+# conda orchestrated build
+message("CIL_VERSION ${CIL_VERSION}")
+#include (GenerateExportHeader)
+
+## Build the regularisers package as a library
+message("Creating Regularisers as a shared library")
+
+find_package(GLIB2 REQUIRED)
+find_package(OpenMP REQUIRED)
+if (OPENMP_FOUND)
+    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}")
+   set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}")
+   set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_STATIC_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}")
+endif()
+
+message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer")
+message("CMAKE_C_FLAGS ${CMAKE_C_FLAGS} -fno-omit-frame-pointer")
+message("CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS}")
+message("CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS}")
+message("CMAKE_STATIC_LINKER_FLAGS ${CMAKE_STATIC_LINKER_FLAGS}")
+
+set(CMAKE_BUILD_TYPE "Release")
+
+if(WIN32)
+  set (FLAGS "/DWIN32 /EHsc /DCCPiCore_EXPORTS /openmp")
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
+  set (CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
+  set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:MSVCRT.lib")
+
+  set (EXTRA_LIBRARIES)
+
+  message("library lib: ${LIBRARY_LIB}")
+
+elseif(UNIX)
+   set (FLAGS "-O2 -funsigned-char -Wall  -Wl,--no-undefined  -DCCPiReconstructionIterative_EXPORTS ")
+   set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
+   set (CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
+
+   set (EXTRA_LIBRARIES
+		"gomp"
+		"m"
+		)
+
+endif()
+message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}")
+
+## Build the regularisers package as a library
+message("Adding regularisers as a shared library")
+
+#set(CMAKE_C_COMPILER /opt/intel/compilers_and_libraries/linux/bin/intel64/icc)
+#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp")
+#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -axAVX2 -xAVX2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp")
+#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp")
+
+#set(CMAKE_C_COMPILER clang)
+#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopenmp")
+
+#set(CMAKE_C_COMPILER gcc-9)
+set(CMAKE_C_FLAGS "-march=native -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -fopenmp")
+#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp")
+#set(CMAKE_C_FLAGS "-march=native -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp")
+#set(CMAKE_C_FLAGS "-march=native -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp")
+#set(CMAKE_C_FLAGS_RELEASE "-g -gdwarf-2 -g3 -fno-omit-frame-pointer")
+
+#set(CMAKE_C_FLAGS "-acc -Minfo -ta=tesla:cc20 -openmp")
+#set(CMAKE_C_FLAGS "-acc -Minfo -ta=multicore -openmp -fPIC")
+add_library(cilreg SHARED
+	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/hw_sched.c
+	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/hw_thread.c
+	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_TV_core.c
+	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/SB_TV_core.c
+            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PD_TV_core.c
+	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/TGV_core.c
+	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Diffusion_core.c
+	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Diffus4th_order_core.c
+	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/LLT_ROF_core.c
+            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/ROF_TV_core.c
+            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_dTV_core.c
+            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/TNV_core.c
+            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Nonlocal_TV_core.c
+            ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PatchSelect_core.c
+	    ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/utils.c
+	    ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/Diffusion_Inpaint_core.c
+	    ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/NonlocalMarching_Inpaint_core.c
+	    )
+target_link_libraries(cilreg ${EXTRA_LIBRARIES} ${GLIB2_LIBRARIES} ${GTHREAD2_LIBRARIES} )
+include_directories(cilreg PUBLIC
+                      ${GLIB2_INCLUDE_DIR}
+                      ${LIBRARY_INC}/include
+					  ${CMAKE_CURRENT_SOURCE_DIR}
+		              ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/
+		              ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/  )
+
+## Install
+
+if (UNIX)
+message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib")
+install(TARGETS cilreg
+	LIBRARY DESTINATION lib
+	CONFIGURATIONS ${CMAKE_BUILD_TYPE}
+	)
+elseif(WIN32)
+message ("I'd install into ${CMAKE_INSTALL_PREFIX} lib bin")
+  install(TARGETS cilreg
+	RUNTIME DESTINATION bin
+	ARCHIVE DESTINATION lib
+	CONFIGURATIONS ${CMAKE_BUILD_TYPE}
+	)
+endif()
+
+
+
+# GPU Regularisers
+if (BUILD_CUDA)
+    find_package(CUDA)
+    if (CUDA_FOUND)
+      set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -shared -D_FORCE_INLINES -allow-unsupported-compiler")
+      message("CUDA FLAGS ${CUDA_NVCC_FLAGS}")
+      CUDA_ADD_LIBRARY(cilregcuda SHARED
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_ROF_GPU_core.cu
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_FGP_GPU_core.cu
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_PD_GPU_core.cu
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_SB_GPU_core.cu
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/LLT_ROF_GPU_core.cu
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TGV_GPU_core.cu
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/dTV_FGP_GPU_core.cu
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/NonlDiff_GPU_core.cu
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/Diffus_4thO_GPU_core.cu
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/PatchSelect_GPU_core.cu
+      )
+      if (UNIX)
+        message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib")
+        install(TARGETS cilregcuda
+        LIBRARY DESTINATION lib
+        CONFIGURATIONS ${CMAKE_BUILD_TYPE}
+        )
+      elseif(WIN32)
+        message ("I'd install into ${CMAKE_INSTALL_PREFIX} lib bin")
+        install(TARGETS cilregcuda
+        RUNTIME DESTINATION bin
+        ARCHIVE DESTINATION lib
+        CONFIGURATIONS ${CMAKE_BUILD_TYPE}
+        )
+      endif()
+    else()
+      message("CUDA NOT FOUND")
+    endif()
+endif()
+
+if (${BUILD_MATLAB_WRAPPER})
+  if (WIN32)
+        install(TARGETS cilreg DESTINATION ${MATLAB_DEST})
+        if (CUDA_FOUND)
+            install(TARGETS cilregcuda DESTINATION ${MATLAB_DEST})
+        endif()
+  endif()
+endif()
diff --git a/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.c b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.c
new file mode 100755
index 0000000..cbce96a
--- /dev/null
+++ b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.c
@@ -0,0 +1,668 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <malloc.h>
+#include "TNV_core.h"
+
+#define BLOCK 32
+#define min(a,b) (((a)<(b))?(a):(b))
+
+inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) {
+    int ii, num;
+    float divsigma = 1.0f / sigma;
+    float sum, shrinkfactor;
+    float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd;
+    float proj[2] = {0};
+
+    // Compute eigenvalues of M
+    T = M1 + M3;
+    D = M1 * M3 - M2 * M2;
+    det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f));
+    eig1 = MAX((T / 2.0f) + det, 0.0f);
+    eig2 = MAX((T / 2.0f) - det, 0.0f);
+    sig1 = sqrtf(eig1);
+    sig2 = sqrtf(eig2);
+
+    // Compute normalized eigenvectors
+    V1 = V2 = V3 = V4 = 0.0f;
+
+    if(M2 != 0.0f)
+    {
+        v0 = M2;
+        v1 = eig1 - M3;
+        v2 = eig2 - M3;
+
+        mu1 = sqrtf(v0 * v0 + v1 * v1);
+        mu2 = sqrtf(v0 * v0 + v2 * v2);
+
+        if(mu1 > fTiny)
+        {
+            V1 = v1 / mu1;
+            V3 = v0 / mu1;
+        }
+
+        if(mu2 > fTiny)
+        {
+            V2 = v2 / mu2;
+            V4 = v0 / mu2;
+        }
+
+    } else
+    {
+        if(M1 > M3)
+        {
+            V1 = V4 = 1.0f;
+            V2 = V3 = 0.0f;
+
+        } else
+        {
+            V1 = V4 = 0.0f;
+            V2 = V3 = 1.0f;
+        }
+    }
+
+    // Compute prox_p of the diagonal entries
+    sig1_upd = sig2_upd = 0.0f;
+
+    if(p == 1)
+    {
+        sig1_upd = MAX(sig1 - divsigma, 0.0f);
+        sig2_upd = MAX(sig2 - divsigma, 0.0f);
+
+    } else if(p == INFNORM)
+    {
+        proj[0] = sigma * fabs(sig1);
+        proj[1] = sigma * fabs(sig2);
+
+        /*l1 projection part */
+        sum = fLarge;
+        num = 0l;
+        shrinkfactor = 0.0f;
+        while(sum > 1.0f)
+        {
+            sum = 0.0f;
+            num = 0;
+
+            for(ii = 0; ii < 2; ii++)
+            {
+                proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f);
+
+                sum += fabs(proj[ii]);
+                if(proj[ii]!= 0.0f)
+                    num++;
+            }
+
+            if(num > 0)
+                shrinkfactor = (sum - 1.0f) / num;
+            else
+                break;
+        }
+        /*l1 proj ends*/
+
+        sig1_upd = sig1 - divsigma * proj[0];
+        sig2_upd = sig2 - divsigma * proj[1];
+    }
+
+    // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$
+    if(sig1 > fTiny)
+        sig1_upd /= sig1;
+
+    if(sig2 > fTiny)
+        sig2_upd /= sig2;
+
+    // Compute solution
+    t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2;
+    t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4;
+    t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4;
+}
+
+
+#include "hw_sched.h"
+typedef _Float16 floatxx;	// Large arrays, allways float16 if we go mixed-precision.
+//typedef _Float16 floatyy;	// Small arrays which we can do both ways.
+typedef float floatyy;
+
+typedef struct {
+    int offY, stepY, copY;
+    floatxx *Input, *u, *qx, *qy, *gradx, *grady, *div;
+    floatyy *div0, *udiff0;
+    floatyy *gradxdiff, *gradydiff, *ubarx, *ubary, *udiff;
+    float resprimal, resdual;
+    float unorm, qnorm, product;
+} tnv_thread_t;
+
+typedef struct {
+    int threads;
+    tnv_thread_t *thr_ctx;
+    float *InputT, *uT;
+    int dimX, dimY, dimZ, padZ;
+    float lambda, sigma, tau, theta;
+} tnv_context_t;
+
+HWSched sched = NULL;
+tnv_context_t tnv_ctx;
+
+
+static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    free(ctx->Input);
+    free(ctx->u);
+    free(ctx->qx);
+    free(ctx->qy);
+    free(ctx->gradx);
+    free(ctx->grady);
+    free(ctx->div);
+    
+    free(ctx->div0);
+    free(ctx->udiff0);
+
+    free(ctx->gradxdiff); 
+    free(ctx->gradydiff);
+    free(ctx->ubarx);
+    free(ctx->ubary);
+    free(ctx->udiff);
+
+    return 0;
+}
+
+static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) {
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+    
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    
+//    printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY);
+
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*(stepY+1)*padZ);
+    long DimRow = (long)(dimX * padZ);
+    long DimCell = (long)(padZ);
+
+    // Auxiliar vectors
+    ctx->Input = memalign(64, Dim1Total * sizeof(floatxx));
+    ctx->u = memalign(64, Dim1Total * sizeof(floatxx));
+    ctx->qx = memalign(64, DimTotal * sizeof(floatxx));
+    ctx->qy = memalign(64, DimTotal * sizeof(floatxx));
+    ctx->gradx = memalign(64, DimTotal * sizeof(floatxx));
+    ctx->grady = memalign(64, DimTotal * sizeof(floatxx));
+    ctx->div = memalign(64, Dim1Total * sizeof(floatxx));
+
+    ctx->div0 = memalign(64, DimRow * sizeof(floatyy));
+    ctx->udiff0 = memalign(64, DimRow * sizeof(floatyy));
+
+    ctx->gradxdiff = memalign(64, DimCell * sizeof(floatyy));
+    ctx->gradydiff = memalign(64, DimCell * sizeof(floatyy));
+    ctx->ubarx = memalign(64, DimCell * sizeof(floatyy));
+    ctx->ubary = memalign(64, DimCell * sizeof(floatyy));
+    ctx->udiff = memalign(64, DimCell * sizeof(floatyy));
+    
+    if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff)||(!ctx->udiff0)) {
+        fprintf(stderr, "Error allocating memory\n");
+        exit(-1);
+    }
+
+    return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+    
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+    
+//    printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY);
+
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    memset(ctx->u, 0, Dim1Total * sizeof(floatxx));
+    memset(ctx->qx, 0, DimTotal * sizeof(floatxx));
+    memset(ctx->qy, 0, DimTotal * sizeof(floatxx));
+    memset(ctx->gradx, 0, DimTotal * sizeof(floatxx));
+    memset(ctx->grady, 0, DimTotal * sizeof(floatxx));
+    memset(ctx->div, 0, Dim1Total * sizeof(floatxx));
+    
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<copY; j++) {
+            for(i=0; i<dimX; i++) {
+                ctx->Input[j * dimX * padZ + i * padZ + k] =  tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+                ctx->u[j * dimX * padZ + i * padZ + k] =  tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<stepY; j++) {
+            for(i=0; i<dimX; i++) {
+                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k];
+            }
+        }
+    }
+    
+    return 0;
+}
+
+
+static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+    
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+    int padZ = tnv_ctx->padZ;
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    memset(ctx->u, 0, Dim1Total * sizeof(floatxx));
+    memset(ctx->qx, 0, DimTotal * sizeof(floatxx));
+    memset(ctx->qy, 0, DimTotal * sizeof(floatxx));
+    memset(ctx->gradx, 0, DimTotal * sizeof(floatxx));
+    memset(ctx->grady, 0, DimTotal * sizeof(floatxx));
+    memset(ctx->div, 0, Dim1Total * sizeof(floatxx));
+
+    return 0;
+}
+
+
+static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
+    long i, j, k, l, m;
+
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+    
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+    floatxx *Input = ctx->Input;
+    floatxx *u = ctx->u;
+    floatxx *qx = ctx->qx;
+    floatxx *qy = ctx->qy;
+    floatxx *gradx = ctx->gradx;
+    floatxx *grady = ctx->grady;
+    floatxx *div = ctx->div;
+
+    long p = 1l;
+    long q = 1l;
+    long r = 0l;
+
+    float lambda = tnv_ctx->lambda;
+    float sigma = tnv_ctx->sigma;
+    float tau = tnv_ctx->tau;
+    float theta = tnv_ctx->theta;
+    
+    float taulambda = tau * lambda;
+    float divtau = 1.0f / tau;
+    float divsigma = 1.0f / sigma;
+    float theta1 = 1.0f + theta;
+    float constant = 1.0f + taulambda;
+
+    float resprimal = 0.0f;
+    float resdual1 = 0.0f;
+    float resdual2 = 0.0f;
+    float product = 0.0f;
+    float unorm = 0.0f;
+    float qnorm = 0.0f;
+
+    floatyy qxdiff;
+    floatyy qydiff;
+    floatyy divdiff;
+    floatyy *gradxdiff = ctx->gradxdiff;
+    floatyy *gradydiff = ctx->gradydiff;
+    floatyy *ubarx = ctx->ubarx;
+    floatyy *ubary = ctx->ubary;
+    floatyy *udiff = ctx->udiff;
+
+    floatyy *udiff0 = ctx->udiff0;
+    floatyy *div0 = ctx->div0;
+
+
+    j = 0; {
+#       define TNV_LOOP_FIRST_J
+        i = 0; {
+#           define TNV_LOOP_FIRST_I
+#           include "TNV_core_loop.h"
+#           undef TNV_LOOP_FIRST_I
+        }
+        for(i = 1; i < (dimX - 1); i++) {
+#           include "TNV_core_loop.h"
+        }
+        i = dimX - 1; {
+#           define TNV_LOOP_LAST_I
+#           include "TNV_core_loop.h"
+#           undef TNV_LOOP_LAST_I
+        }
+#       undef TNV_LOOP_FIRST_J
+    }
+
+
+
+    for(int j = 1; j < (copY - 1); j++) {
+        i = 0; {
+#           define TNV_LOOP_FIRST_I
+#           include "TNV_core_loop.h"
+#           undef TNV_LOOP_FIRST_I
+        }
+    }
+
+    for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) {
+        for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) {
+            for(int j2 = 0; j2 < BLOCK; j2 ++) {
+                j = j1 + j2;
+                for(int i2 = 0; i2 < BLOCK; i2++) {
+                    i = i1 + i2;
+                    
+                    if (i == (dimX - 1)) break;
+                    if (j == (copY - 1)) { j2 = BLOCK; break; }
+#           include "TNV_core_loop.h"
+                }   
+            }
+        } // i
+
+    }
+
+    for(int j = 1; j < (copY - 1); j++) {
+        i = dimX - 1; {
+#           define TNV_LOOP_LAST_I
+#           include "TNV_core_loop.h"
+#           undef TNV_LOOP_LAST_I
+        }
+    }
+
+
+
+    for (j = copY - 1; j < stepY; j++) {
+#       define TNV_LOOP_LAST_J
+        i = 0; {
+#           define TNV_LOOP_FIRST_I
+#           include "TNV_core_loop.h"
+#           undef TNV_LOOP_FIRST_I
+        }
+        for(i = 1; i < (dimX - 1); i++) {
+#           include "TNV_core_loop.h"
+        }
+        i = dimX - 1; {
+#           define TNV_LOOP_LAST_I
+#           include "TNV_core_loop.h"
+#           undef TNV_LOOP_LAST_I
+        }
+#       undef TNV_LOOP_LAST_J
+    }
+
+
+
+    ctx->resprimal = resprimal;
+    ctx->resdual = resdual1 + resdual2;
+    ctx->product = product;
+    ctx->unorm = unorm;
+    ctx->qnorm = qnorm;
+
+    return 0;
+}
+
+static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) {
+    int i, off, size, err;
+
+    if (sched) return;
+
+    tnv_ctx.dimX = dimX;
+    tnv_ctx.dimY = dimY;
+    tnv_ctx.dimZ = dimZ;
+        // Padding seems actually slower
+//    tnv_ctx.padZ = dimZ;
+//    tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0));
+    tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0));
+    
+    hw_sched_init();
+
+    int threads = hw_sched_get_cpu_count();
+    if (threads > dimY) threads = dimY/2;
+
+    int step = dimY / threads;
+    int extra = dimY % threads;
+
+    tnv_ctx.threads = threads;
+    tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t));
+    for (i = 0, off = 0; i < threads; i++, off += size) {
+        tnv_thread_t *ctx = tnv_ctx.thr_ctx + i;
+        size = step + ((i < extra)?1:0);
+
+        ctx->offY = off;
+        ctx->stepY = size;
+
+        if (i == (threads-1)) ctx->copY = ctx->stepY;
+        else ctx->copY = ctx->stepY + 1;
+    }
+
+    sched = hw_sched_create(threads);
+    if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); }
+
+    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init);
+    if (!err) err = hw_sched_wait_task(sched);
+    if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); }
+}
+
+
+
+/*
+ * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1]
+ * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see
+ * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package
+ *
+ * Input Parameters:
+ * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume
+ * 2. lambda - regularisation parameter
+ * 3. Number of iterations [OPTIONAL parameter]
+ * 4. eplsilon - tolerance constant [OPTIONAL parameter]
+ * 5. print information: 0 (off) or 1 (on)  [OPTIONAL parameter]
+ *
+ * Output:
+ * 1. Filtered/regularized image (u)
+ *
+ * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151.
+ */
+
+float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ)
+{
+    int err;
+    int iter;
+    int i,j,k,l,m;
+
+    lambda = 1.0f/(2.0f*lambda);
+    tnv_ctx.lambda = lambda;
+
+    // PDHG algorithm parameters
+    float tau = 0.5f;
+    float sigma = 0.5f;
+    float theta = 1.0f;
+
+    // Backtracking parameters
+    float s = 1.0f;
+    float gamma = 0.75f;
+    float beta = 0.95f;
+    float alpha0 = 0.2f;
+    float alpha = alpha0;
+    float delta = 1.5f;
+    float eta = 0.95f;
+
+    TNV_CPU_init(InputT, uT, dimX, dimY, dimZ);
+
+    tnv_ctx.InputT = InputT;
+    tnv_ctx.uT = uT;
+    
+    int padZ = tnv_ctx.padZ;
+
+    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start);
+    if (!err) err = hw_sched_wait_task(sched);
+    if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); }
+
+
+    // Apply Primal-Dual Hybrid Gradient scheme
+    float residual = fLarge;
+    int started = 0;
+    for(iter = 0; iter < maxIter; iter++)   {
+        float resprimal = 0.0f;
+        float resdual = 0.0f;
+        float product = 0.0f;
+        float unorm = 0.0f;
+        float qnorm = 0.0f;
+
+        float divtau = 1.0f / tau;
+
+        tnv_ctx.sigma = sigma;
+        tnv_ctx.tau = tau;
+        tnv_ctx.theta = theta;
+
+        err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step);
+        if (!err) err = hw_sched_wait_task(sched);
+        if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); }
+
+            // border regions
+        for (j = 1; j < tnv_ctx.threads; j++) {
+            tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1);
+            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
+
+            m = (ctx0->stepY - 1) * dimX * padZ;
+            for(i = 0; i < dimX; i++) {
+                for(k = 0; k < dimZ; k++) {
+                    int l = i * padZ + k;
+                        
+                    floatyy divdiff = ctx->div0[l] - ctx->div[l];
+                    floatyy udiff = ctx->udiff0[l];
+
+                    ctx->div[l] -= ctx0->qy[l + m];
+                    ctx0->div[m + l + dimX*padZ] = ctx->div[l];
+                    ctx0->u[m + l + dimX*padZ] = ctx->u[l];
+                    
+                    divdiff += ctx0->qy[l + m];
+                    resprimal += fabs(divtau * udiff + divdiff); 
+                }
+            }
+        }
+
+        {
+            tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0;
+            for(i = 0; i < dimX; i++) {
+                for(k = 0; k < dimZ; k++) {
+                    int l = i * padZ + k;
+                        
+                    floatyy divdiff = ctx->div0[l] - ctx->div[l];
+                    floatyy udiff = ctx->udiff0[l];
+                    resprimal += fabs(divtau * udiff + divdiff); 
+                }
+            }
+        }
+
+        for (j = 0; j < tnv_ctx.threads; j++) {
+            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
+            resprimal += ctx->resprimal;
+            resdual += ctx->resdual;
+            product += ctx->product;
+            unorm += ctx->unorm;
+            qnorm += ctx->qnorm;
+        } 
+
+        residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ));
+        float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm);
+        float dual_dot_delta = resdual * s * delta;
+        float dual_div_delta = (resdual * s) / delta;
+//        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
+
+
+        if(b > 1) {
+            
+            // Decrease step-sizes to fit balancing principle
+            tau = (beta * tau) / b;
+            sigma = (beta * sigma) / b;
+            alpha = alpha0;
+
+            if (started) {
+                fprintf(stderr, "\n\n\nWARNING: Back-tracking is required in the middle of iterative optimization! We CAN'T do it in the fast version. The standard TNV recommended\n\n\n");
+            } else {
+                err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore);
+                if (!err) err = hw_sched_wait_task(sched);
+                if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); }
+            }
+        } else {
+            started = 1;
+            if(resprimal > dual_dot_delta) {
+                    // Increase primal step-size and decrease dual step-size
+                tau = tau / (1.0f - alpha);
+                sigma = sigma * (1.0f - alpha);
+                alpha = alpha * eta;
+            } else if(resprimal < dual_div_delta) {
+                    // Decrease primal step-size and increase dual step-size
+                tau = tau * (1.0f - alpha);
+                sigma = sigma / (1.0f - alpha);
+                alpha = alpha * eta;
+            }
+        }
+
+        if (residual < tol) break;
+    }
+
+    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+    if (!err) err = hw_sched_wait_task(sched);
+    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
+
+
+    printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+//    printf("Return: %f\n", *uT);
+
+    return *uT;
+}
diff --git a/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.h b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.h
new file mode 100644
index 0000000..aa050a4
--- /dev/null
+++ b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.h
@@ -0,0 +1,47 @@
+#include <math.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <stdio.h>
+#include "omp.h"
+#include "utils.h"
+#include "CCPiDefines.h"
+
+#define fTiny 0.00000001f
+#define fLarge 100000000.0f
+#define INFNORM -1
+
+#define MAX(i,j) ((i)<(j) ? (j):(i))
+#define MIN(i,j) ((i)<(j) ? (i):(j))
+
+/*
+This work is part of the Core Imaging Library developed by
+Visual Analytics and Imaging System Group of the Science Technology
+Facilities Council, STFC
+
+Copyright 2017 Daniil Kazantsev
+Copyright 2017 Srikanth Nagella, Edoardo Pasca
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+CCPI_EXPORT float TNV_CPU_main(float *Input, float *u, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ);
+
+/*float PDHG(float *A, float *B, float tau, float sigma, float theta, float lambda, int p, int q, int r, float tol, int maxIter, int d_c, int d_w, int d_h);*/
+CCPI_EXPORT float proxG(float *u_upd, float *v, float *f, float taulambda, long dimX, long dimY, long dimZ);
+CCPI_EXPORT float gradient(float *u_upd, float *gradx_upd, float *grady_upd, long dimX, long dimY, long dimZ);
+CCPI_EXPORT float proxF(float *gx, float *gy, float *vx, float *vy, float sigma, int p, int q, int r, long dimX, long dimY, long dimZ);
+CCPI_EXPORT float divergence(float *qx_upd, float *qy_upd, float *div_upd, long dimX, long dimY, long dimZ);
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
diff --git a/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core_loop.h b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core_loop.h
new file mode 100644
index 0000000..86299cd
--- /dev/null
+++ b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core_loop.h
@@ -0,0 +1,107 @@
+    {
+            float t[3];
+            float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
+            l = (j * dimX  + i) * padZ;
+            m = dimX * padZ;
+        
+            floatxx *__restrict u_next = u + l + padZ;
+            floatxx *__restrict u_current = u + l;
+            floatxx *__restrict u_next_row = u + l + m;
+
+
+            floatxx *__restrict qx_current = qx + l;
+            floatxx *__restrict qy_current = qy + l;
+            floatxx *__restrict qx_prev = qx + l - padZ;
+            floatxx *__restrict qy_prev = qy + l - m;
+
+
+//  __assume(padZ%16==0);
+
+#pragma vector aligned
+#pragma GCC ivdep 
+            for(k = 0; k < dimZ; k++) {
+                floatyy u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; // 3 reads
+                udiff[k] = u[l + k] - u_upd; // cache 1w
+                u[l + k] = u_upd; // 1 write
+
+#ifdef TNV_LOOP_FIRST_J
+                udiff0[l + k] = udiff[k];
+                div0[l + k] = div[l + k];
+#endif
+
+#ifdef TNV_LOOP_LAST_I
+                floatyy gradx_upd = 0;
+#else
+                floatyy u_next_upd = (u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant; // 3 reads
+                floatyy gradx_upd = (u_next_upd - u_upd); // 2 reads
+#endif
+
+#ifdef TNV_LOOP_LAST_J
+                floatyy grady_upd = 0;
+#else
+                floatyy u_next_row_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; // 3 reads
+                floatyy grady_upd = (u_next_row_upd - u_upd); // 1 read
+#endif
+
+                gradxdiff[k] = gradx[l + k] - gradx_upd; // 1 read, cache 1w
+                gradydiff[k] = grady[l + k] - grady_upd; // 1 read, cache 1w
+                gradx[l + k] = gradx_upd; // 1 write
+                grady[l + k] = grady_upd; // 1 write
+                
+                ubarx[k] = gradx_upd - theta * gradxdiff[k]; // cache 1w
+                ubary[k] = grady_upd - theta * gradydiff[k]; // cache 1w
+
+                floatyy vx = ubarx[k] + divsigma * qx[l + k]; // 1 read
+                floatyy vy = ubary[k] + divsigma * qy[l + k]; // 1 read
+
+                M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
+            }
+
+            coefF(t, M1, M2, M3, sigma, p, q, r);
+            
+#pragma vector aligned
+#pragma GCC ivdep 
+            for(k = 0; k < padZ; k++) {
+                floatyy vx = ubarx[k] + divsigma * qx_current[k]; // cache 2r
+                floatyy vy = ubary[k] + divsigma * qy_current[k]; // cache 2r
+                floatyy gx_upd = vx * t[0] + vy * t[1];
+                floatyy gy_upd = vx * t[1] + vy * t[2];
+
+                qxdiff = sigma * (ubarx[k] - gx_upd);
+                qydiff = sigma * (ubary[k] - gy_upd);
+                
+                qx_current[k] += qxdiff; // write 1
+                qy_current[k] += qydiff; // write 1
+
+                unorm += (udiff[k] * udiff[k]);
+                qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+
+                floatyy div_upd = 0;
+
+#ifndef TNV_LOOP_FIRST_I
+                div_upd -= qx_prev[k]; // 1 read
+#endif
+#ifndef TNV_LOOP_FIRST_J
+                div_upd -= qy_prev[k]; // 1 read
+#endif
+#ifndef TNV_LOOP_LAST_I
+                div_upd += qx_current[k]; 
+#endif
+#ifndef TNV_LOOP_LAST_J
+                div_upd += qy_current[k]; 
+#endif
+
+                divdiff = div[l + k] - div_upd;  // 1 read
+                div[l + k] = div_upd; // 1 write
+
+#ifndef TNV_LOOP_FIRST_J
+                resprimal += fabs(divtau * udiff[k] + divdiff); 
+#endif
+
+                    // We need to have two independent accumulators to allow gcc-autovectorization
+                resdual1 += fabs(divsigma * qxdiff + gradxdiff[k]); // cache 1r
+                resdual2 += fabs(divsigma * qydiff + gradydiff[k]); // cache 1r
+                product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
+            }
+    }
+    
\ No newline at end of file
diff --git a/patches/ccpi-regularisation-toolkit-fgptv-integer-overflow.patch b/patches/ccpi-regularisation-toolkit-fgptv-integer-overflow.patch
new file mode 100644
index 0000000..cc53b1d
--- /dev/null
+++ b/patches/ccpi-regularisation-toolkit-fgptv-integer-overflow.patch
@@ -0,0 +1,12 @@
+diff -dPNur CCPi-Regularisation-Toolkit-orig/src/Core/regularisers_CPU/FGP_TV_core.c CCPi-Regularisation-Toolkit/src/Core/regularisers_CPU/FGP_TV_core.c
+--- CCPi-Regularisation-Toolkit-orig/src/Core/regularisers_CPU/FGP_TV_core.c	2021-12-17 12:17:12.000000000 +0100
++++ CCPi-Regularisation-Toolkit/src/Core/regularisers_CPU/FGP_TV_core.c	2022-09-06 18:52:32.460225737 +0200
+@@ -104,7 +104,7 @@
+     else {
+         /*3D case*/
+         float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL;
+-        DimTotal = (long)(dimX*dimY*dimZ);
++        DimTotal = (long)dimX*(long)dimY*(long)dimZ;
+ 
+         if (epsil != 0.0f) Output_prev = calloc(DimTotal, sizeof(float));
+         P1 = calloc(DimTotal, sizeof(float));
diff --git a/patches/ccpi-regularisation-toolkit-fgptv-openmp.patch b/patches/ccpi-regularisation-toolkit-fgptv-openmp.patch
new file mode 100644
index 0000000..09b5f51
--- /dev/null
+++ b/patches/ccpi-regularisation-toolkit-fgptv-openmp.patch
@@ -0,0 +1,19 @@
+--- a/src/Core/regularisers_CPU/utils.c
++++ b/src/Core/regularisers_CPU/utils.c
+@@ -176,6 +176,7 @@ float Proj_func2D(float *P1, float *P2, int methTV, long DimTotal)
+     }
+     return 1;
+ }
++
+ /*3D Projection onto convex set for P (called in PD_TV, FGP_TV, FGP_dTV methods)*/
+ float Proj_func3D(float *P1, float *P2, float *P3, int methTV, long DimTotal)
+ {
+@@ -183,7 +184,7 @@ float Proj_func3D(float *P1, float *P2, float *P3, int methTV, long DimTotal)
+     long i;
+     if (methTV == 0) {
+         /* isotropic TV*/
+-#pragma omp parallel for shared(P1,P2,P3) private(i,val1,val2,val3,sq_denom)
++#pragma omp parallel for shared(P1,P2,P3) private(i,val1,val2,val3,denom,sq_denom)
+         for(i=0; i<DimTotal; i++) {
+             denom = powf(P1[i],2) + powf(P2[i],2) + powf(P3[i],2);
+             if (denom > 1.0f) {
-- 
cgit v1.2.1