1334
#define subh_cache(subh) cache[(int)(subh)]
1337
__kernel void hst_backproject_6ppt_oversample4(const int num_proj,
1341
__read_only image2d_t sinogram,
1342
__global float *slice,
1343
__constant float4 *c_all,
1344
__local float *cache
1346
const int tidx = get_local_id(0);
1347
const int tidy = get_local_id(1);
1349
const int bidx = PPT * get_group_id(0) * get_local_size(0);
1350
const int bidy = PPT * (get_global_id(1) - get_local_id(1));
1352
const float bx = bidx + off_x;
1353
const float by = bidy + off_y;
1356
const int sbidx = tidy % 4;
1357
const int sbidy = tidy / 4;
1359
const int stidx = tidx % 4;
1360
const int stidy = tidx / 4;
1362
const int sidx = (sbidx * 4 + stidx);
1363
const int sidy = (sbidy * 4 + stidy);
1366
const int sidx = tidx;
1367
const int sidy = tidy;
1370
const float sx = sidx;
1371
const float sy = sidy;
1373
const int idx = bidx + sidx;
1374
const int idy = bidy + sidy;
1376
const float x = idx + off_x;
1377
const float y = idy + off_y;
1379
const int slice_width = PPT * get_global_size(0);
1381
float res[PPT][PPT] = {0};
1382
for (int proj = 0; proj < num_proj; proj++) {
1383
float4 all = c_all[proj]; all.y = -all.y;
1384
float minh = floor(all.z + bx * all.x + by * all.y + all.w);
1387
for (int i = 0; i < 5; i++) {
1388
int pos = i * get_local_size(0) * get_local_size(0) + tidy * get_local_size(0) + tidx;
1389
cache[pos] = read_imagef(sinogram, volumeSampler, (float2)(minh + 0.25f*pos, proj + .5f)).x;
1392
barrier(CLK_LOCAL_MEM_FENCE);
1396
float h = mad(x, all.x, mad(y, all.y, all.z)) - 4 * minh;
1399
for (int i = 0; i < PPT; i++) {
1401
for (int j = 0; j < PPT; j++) {
1402
res[i][j] += subh_cache(h + (i * get_local_size(0)) * all.y + (j * get_local_size(0)) * all.x);
1406
barrier(CLK_LOCAL_MEM_FENCE);
1410
for (int i = 0; i < PPT; i++) {
1412
for (int j = 0; j < PPT; j++) {
1413
slice[(idy + get_local_size(1) * i) * slice_width + idx + get_local_size(0) * j] = res[i][j];
1419
__kernel void hst_backproject_6ppt_tex(const int num_proj,
1423
__read_only image2d_t sinogram,
1424
__global float *slice,
1425
__constant float4 *c_all/*,
1426
__local float *shared*/
1429
const int tidx = get_local_id(0);
1430
const int tidy = get_local_id(1);
1432
const int sidx = tidx;
1433
const int sidy = tidy;
1435
const int idx = PPT * get_group_id(0) * get_local_size(0) + sidx;
1436
const int idy = PPT * (get_global_id(1) - get_local_id(1)) + sidy;
1438
const float x = idx + off_x;
1439
const float y = idy + off_y;
1441
const int slice_width = PPT * get_global_size(0);
1443
float res[PPT][PPT] = {0};
1445
for (int proj = 0; proj < num_proj; proj++) {
1446
float4 all = c_all[proj];
1448
float subh = mad(x, all.x, mad(y, all.y, all.z));
1451
for (int i = 0; i < PPT; i++) {
1453
for (int j = 0; j < PPT; j++) {
1454
res[i][j] += read_imagef(sinogram, volumeSampler, (float2)(subh + i * get_local_size(1) * all.y + j * get_local_size(0) * all.x, proj + .5f)).x;
1461
for (int i = 0; i < PPT; i++) {
1463
for (int j = 0; j < PPT; j++) {
1464
slice[(idy + get_local_size(1) * i) * slice_width + idx + get_local_size(0) * j] = res[i][j];
1471
__kernel void hst_backproject_6ppt(const int num_proj,
1475
__read_only image2d_t sinogram,
1476
__global float *slice,
1477
__constant float4 *c_all,
1478
__local float *cache) {
1481
const int mode = (get_group_id(1) * get_num_groups(0) + get_group_id(0)) % 5;
1483
hst_backproject_test_oversample4_2(num_proj, num_bins, off_x, off_y, sinogram, slice, c_all, cache);
1485
hst_backproject_test_oversample4_1(num_proj, num_bins, off_x, off_y, sinogram, slice, c_all, cache);
1494
__kernel void hst_backproject_8ppt_vliw_oversample4(const int num_proj,
1498
__read_only image2d_t sinogram,
1499
__global float *slice,
1500
__constant float4 *c_all,
1501
__local float *cache
1503
const int tidx = get_local_id(0);
1504
const int tidy = get_local_id(1);
1506
const int bidx = PPT * get_group_id(0) * get_local_size(0);
1507
const int bidy = PPT * (get_global_id(1) - get_local_id(1));
1509
const float bx = bidx + off_x;
1510
const float by = bidy + off_y;
1512
const int sidx = tidx;
1513
const int sidy = tidy;
1515
const int idx = bidx + sidx;
1516
const int idy = bidy + sidy;
1518
const float x = idx + off_x;
1519
const float y = idy + off_y;
1521
const int slice_width = PPT * get_global_size(0);
1523
float res[PPT][PPT] = {0};
1524
for (int proj = 0; proj < num_proj; proj++) {
1525
float4 all = c_all[proj]; all.y = -all.y;
1526
float minh = floor(all.z + bx * all.x + by * all.y + all.w);
1529
for (int i = 0; i < 6; i++) {
1530
int pos = i * get_local_size(0) * get_local_size(0) + tidy * get_local_size(0) + tidx;
1531
cache[pos] = read_imagef(sinogram, volumeSampler, (float2)(minh + 0.25f*pos, proj + .5f)).x;
1534
barrier(CLK_LOCAL_MEM_FENCE);
1538
float h = mad(x, all.x, mad(y, all.y, all.z)) - 4 * minh;
1541
for (int i = 0; i < PPT; i++) {
1543
for (int j = 0; j < PPT; j++) {
1544
res[i][j] += subh_cache(h + (i * get_local_size(0)) * all.y + (j * get_local_size(0)) * all.x);
1548
barrier(CLK_LOCAL_MEM_FENCE);
1552
for (int i = 0; i < PPT; i++) {
1554
for (int j = 0; j < PPT; j++) {
1555
slice[(idy + get_local_size(1) * i) * slice_width + idx + get_local_size(0) * j] = res[i][j];
1561
__kernel void hst_backproject_8ppt_vliw_oversample4(const int num_proj,
1565
__read_only image2d_t sinogram,
1566
__global float *slice,
1567
__constant float4 *c_all,
1568
__local float *cache
1570
const int tidx = get_local_id(0);
1571
const int tidy = get_local_id(1);
1573
const int bidx = PPT * get_group_id(0) * get_local_size(0);
1574
const int bidy = PPT * (get_global_id(1) - get_local_id(1));
1576
const float bx = bidx + off_x;
1577
const float by = bidy + off_y;
1579
const int sidx = tidx;
1580
const int sidy = tidy;
1582
const int idx = bidx + sidx;
1583
const int idy = bidy + sidy;
1585
const float x = idx + off_x;
1586
const float y = idy + off_y;
1588
const int slice_width = PPT * get_global_size(0);
1655
for (int proj = 0; proj < num_proj; proj++) {
1656
float4 all = c_all[proj]; all.y = -all.y;
1657
float minh = floor(all.z + bx * all.x + by * all.y + all.w);
1660
for (int i = 0; i < 6; i++) {
1661
int pos = i * get_local_size(0) * get_local_size(0) + tidy * get_local_size(0) + tidx;
1662
cache[pos] = read_imagef(sinogram, volumeSampler, (float2)(minh + 0.25f*pos, proj + .5f)).x;
1665
barrier(CLK_LOCAL_MEM_FENCE);
1669
float h = mad(x, all.x, mad(y, all.y, all.z)) - 4 * minh;
1671
res_0_0 += subh_cache(h + (0 * get_local_size(0)) * all.y + (0 * get_local_size(0)) * all.x);
1672
res_0_1 += subh_cache(h + (0 * get_local_size(0)) * all.y + (1 * get_local_size(0)) * all.x);
1673
res_0_2 += subh_cache(h + (0 * get_local_size(0)) * all.y + (2 * get_local_size(0)) * all.x);
1674
res_0_3 += subh_cache(h + (0 * get_local_size(0)) * all.y + (3 * get_local_size(0)) * all.x);
1675
res_0_4 += subh_cache(h + (0 * get_local_size(0)) * all.y + (4 * get_local_size(0)) * all.x);
1676
res_0_5 += subh_cache(h + (0 * get_local_size(0)) * all.y + (5 * get_local_size(0)) * all.x);
1677
res_0_6 += subh_cache(h + (0 * get_local_size(0)) * all.y + (6 * get_local_size(0)) * all.x);
1678
res_0_7 += subh_cache(h + (0 * get_local_size(0)) * all.y + (7 * get_local_size(0)) * all.x);
1679
res_1_0 += subh_cache(h + (1 * get_local_size(0)) * all.y + (0 * get_local_size(0)) * all.x);
1680
res_1_1 += subh_cache(h + (1 * get_local_size(0)) * all.y + (1 * get_local_size(0)) * all.x);
1681
res_1_2 += subh_cache(h + (1 * get_local_size(0)) * all.y + (2 * get_local_size(0)) * all.x);
1682
res_1_3 += subh_cache(h + (1 * get_local_size(0)) * all.y + (3 * get_local_size(0)) * all.x);
1683
res_1_4 += subh_cache(h + (1 * get_local_size(0)) * all.y + (4 * get_local_size(0)) * all.x);
1684
res_1_5 += subh_cache(h + (1 * get_local_size(0)) * all.y + (5 * get_local_size(0)) * all.x);
1685
res_1_6 += subh_cache(h + (1 * get_local_size(0)) * all.y + (6 * get_local_size(0)) * all.x);
1686
res_1_7 += subh_cache(h + (1 * get_local_size(0)) * all.y + (7 * get_local_size(0)) * all.x);
1687
res_2_0 += subh_cache(h + (2 * get_local_size(0)) * all.y + (0 * get_local_size(0)) * all.x);
1688
res_2_1 += subh_cache(h + (2 * get_local_size(0)) * all.y + (1 * get_local_size(0)) * all.x);
1689
res_2_2 += subh_cache(h + (2 * get_local_size(0)) * all.y + (2 * get_local_size(0)) * all.x);
1690
res_2_3 += subh_cache(h + (2 * get_local_size(0)) * all.y + (3 * get_local_size(0)) * all.x);
1691
res_2_4 += subh_cache(h + (2 * get_local_size(0)) * all.y + (4 * get_local_size(0)) * all.x);
1692
res_2_5 += subh_cache(h + (2 * get_local_size(0)) * all.y + (5 * get_local_size(0)) * all.x);
1693
res_2_6 += subh_cache(h + (2 * get_local_size(0)) * all.y + (6 * get_local_size(0)) * all.x);
1694
res_2_7 += subh_cache(h + (2 * get_local_size(0)) * all.y + (7 * get_local_size(0)) * all.x);
1695
res_3_0 += subh_cache(h + (3 * get_local_size(0)) * all.y + (0 * get_local_size(0)) * all.x);
1696
res_3_1 += subh_cache(h + (3 * get_local_size(0)) * all.y + (1 * get_local_size(0)) * all.x);
1697
res_3_2 += subh_cache(h + (3 * get_local_size(0)) * all.y + (2 * get_local_size(0)) * all.x);
1698
res_3_3 += subh_cache(h + (3 * get_local_size(0)) * all.y + (3 * get_local_size(0)) * all.x);
1699
res_3_4 += subh_cache(h + (3 * get_local_size(0)) * all.y + (4 * get_local_size(0)) * all.x);
1700
res_3_5 += subh_cache(h + (3 * get_local_size(0)) * all.y + (5 * get_local_size(0)) * all.x);
1701
res_3_6 += subh_cache(h + (3 * get_local_size(0)) * all.y + (6 * get_local_size(0)) * all.x);
1702
res_3_7 += subh_cache(h + (3 * get_local_size(0)) * all.y + (7 * get_local_size(0)) * all.x);
1703
res_4_0 += subh_cache(h + (4 * get_local_size(0)) * all.y + (0 * get_local_size(0)) * all.x);
1704
res_4_1 += subh_cache(h + (4 * get_local_size(0)) * all.y + (1 * get_local_size(0)) * all.x);
1705
res_4_2 += subh_cache(h + (4 * get_local_size(0)) * all.y + (2 * get_local_size(0)) * all.x);
1706
res_4_3 += subh_cache(h + (4 * get_local_size(0)) * all.y + (3 * get_local_size(0)) * all.x);
1707
res_4_4 += subh_cache(h + (4 * get_local_size(0)) * all.y + (4 * get_local_size(0)) * all.x);
1708
res_4_5 += subh_cache(h + (4 * get_local_size(0)) * all.y + (5 * get_local_size(0)) * all.x);
1709
res_4_6 += subh_cache(h + (4 * get_local_size(0)) * all.y + (6 * get_local_size(0)) * all.x);
1710
res_4_7 += subh_cache(h + (4 * get_local_size(0)) * all.y + (7 * get_local_size(0)) * all.x);
1711
res_5_0 += subh_cache(h + (5 * get_local_size(0)) * all.y + (0 * get_local_size(0)) * all.x);
1712
res_5_1 += subh_cache(h + (5 * get_local_size(0)) * all.y + (1 * get_local_size(0)) * all.x);
1713
res_5_2 += subh_cache(h + (5 * get_local_size(0)) * all.y + (2 * get_local_size(0)) * all.x);
1714
res_5_3 += subh_cache(h + (5 * get_local_size(0)) * all.y + (3 * get_local_size(0)) * all.x);
1715
res_5_4 += subh_cache(h + (5 * get_local_size(0)) * all.y + (4 * get_local_size(0)) * all.x);
1716
res_5_5 += subh_cache(h + (5 * get_local_size(0)) * all.y + (5 * get_local_size(0)) * all.x);
1717
res_5_6 += subh_cache(h + (5 * get_local_size(0)) * all.y + (6 * get_local_size(0)) * all.x);
1718
res_5_7 += subh_cache(h + (5 * get_local_size(0)) * all.y + (7 * get_local_size(0)) * all.x);
1719
res_6_0 += subh_cache(h + (6 * get_local_size(0)) * all.y + (0 * get_local_size(0)) * all.x);
1720
res_6_1 += subh_cache(h + (6 * get_local_size(0)) * all.y + (1 * get_local_size(0)) * all.x);
1721
res_6_2 += subh_cache(h + (6 * get_local_size(0)) * all.y + (2 * get_local_size(0)) * all.x);
1722
res_6_3 += subh_cache(h + (6 * get_local_size(0)) * all.y + (3 * get_local_size(0)) * all.x);
1723
res_6_4 += subh_cache(h + (6 * get_local_size(0)) * all.y + (4 * get_local_size(0)) * all.x);
1724
res_6_5 += subh_cache(h + (6 * get_local_size(0)) * all.y + (5 * get_local_size(0)) * all.x);
1725
res_6_6 += subh_cache(h + (6 * get_local_size(0)) * all.y + (6 * get_local_size(0)) * all.x);
1726
res_6_7 += subh_cache(h + (6 * get_local_size(0)) * all.y + (7 * get_local_size(0)) * all.x);
1727
res_7_0 += subh_cache(h + (7 * get_local_size(0)) * all.y + (0 * get_local_size(0)) * all.x);
1728
res_7_1 += subh_cache(h + (7 * get_local_size(0)) * all.y + (1 * get_local_size(0)) * all.x);
1729
res_7_2 += subh_cache(h + (7 * get_local_size(0)) * all.y + (2 * get_local_size(0)) * all.x);
1730
res_7_3 += subh_cache(h + (7 * get_local_size(0)) * all.y + (3 * get_local_size(0)) * all.x);
1731
res_7_4 += subh_cache(h + (7 * get_local_size(0)) * all.y + (4 * get_local_size(0)) * all.x);
1732
res_7_5 += subh_cache(h + (7 * get_local_size(0)) * all.y + (5 * get_local_size(0)) * all.x);
1733
res_7_6 += subh_cache(h + (7 * get_local_size(0)) * all.y + (6 * get_local_size(0)) * all.x);
1734
res_7_7 += subh_cache(h + (7 * get_local_size(0)) * all.y + (7 * get_local_size(0)) * all.x);
1736
barrier(CLK_LOCAL_MEM_FENCE);
1739
slice[(idy + get_local_size(1) * 0) * slice_width + idx + get_local_size(0) * 0] = res_0_0;
1740
slice[(idy + get_local_size(1) * 0) * slice_width + idx + get_local_size(0) * 1] = res_0_1;
1741
slice[(idy + get_local_size(1) * 0) * slice_width + idx + get_local_size(0) * 2] = res_0_2;
1742
slice[(idy + get_local_size(1) * 0) * slice_width + idx + get_local_size(0) * 3] = res_0_3;
1743
slice[(idy + get_local_size(1) * 0) * slice_width + idx + get_local_size(0) * 4] = res_0_4;
1744
slice[(idy + get_local_size(1) * 0) * slice_width + idx + get_local_size(0) * 5] = res_0_5;
1745
slice[(idy + get_local_size(1) * 0) * slice_width + idx + get_local_size(0) * 6] = res_0_6;
1746
slice[(idy + get_local_size(1) * 0) * slice_width + idx + get_local_size(0) * 7] = res_0_7;
1747
slice[(idy + get_local_size(1) * 1) * slice_width + idx + get_local_size(0) * 0] = res_1_0;
1748
slice[(idy + get_local_size(1) * 1) * slice_width + idx + get_local_size(0) * 1] = res_1_1;
1749
slice[(idy + get_local_size(1) * 1) * slice_width + idx + get_local_size(0) * 2] = res_1_2;
1750
slice[(idy + get_local_size(1) * 1) * slice_width + idx + get_local_size(0) * 3] = res_1_3;
1751
slice[(idy + get_local_size(1) * 1) * slice_width + idx + get_local_size(0) * 4] = res_1_4;
1752
slice[(idy + get_local_size(1) * 1) * slice_width + idx + get_local_size(0) * 5] = res_1_5;
1753
slice[(idy + get_local_size(1) * 1) * slice_width + idx + get_local_size(0) * 6] = res_1_6;
1754
slice[(idy + get_local_size(1) * 1) * slice_width + idx + get_local_size(0) * 7] = res_1_7;
1755
slice[(idy + get_local_size(1) * 2) * slice_width + idx + get_local_size(0) * 0] = res_2_0;
1756
slice[(idy + get_local_size(1) * 2) * slice_width + idx + get_local_size(0) * 1] = res_2_1;
1757
slice[(idy + get_local_size(1) * 2) * slice_width + idx + get_local_size(0) * 2] = res_2_2;
1758
slice[(idy + get_local_size(1) * 2) * slice_width + idx + get_local_size(0) * 3] = res_2_3;
1759
slice[(idy + get_local_size(1) * 2) * slice_width + idx + get_local_size(0) * 4] = res_2_4;
1760
slice[(idy + get_local_size(1) * 2) * slice_width + idx + get_local_size(0) * 5] = res_2_5;
1761
slice[(idy + get_local_size(1) * 2) * slice_width + idx + get_local_size(0) * 6] = res_2_6;
1762
slice[(idy + get_local_size(1) * 2) * slice_width + idx + get_local_size(0) * 7] = res_2_7;
1763
slice[(idy + get_local_size(1) * 3) * slice_width + idx + get_local_size(0) * 0] = res_3_0;
1764
slice[(idy + get_local_size(1) * 3) * slice_width + idx + get_local_size(0) * 1] = res_3_1;
1765
slice[(idy + get_local_size(1) * 3) * slice_width + idx + get_local_size(0) * 2] = res_3_2;
1766
slice[(idy + get_local_size(1) * 3) * slice_width + idx + get_local_size(0) * 3] = res_3_3;
1767
slice[(idy + get_local_size(1) * 3) * slice_width + idx + get_local_size(0) * 4] = res_3_4;
1768
slice[(idy + get_local_size(1) * 3) * slice_width + idx + get_local_size(0) * 5] = res_3_5;
1769
slice[(idy + get_local_size(1) * 3) * slice_width + idx + get_local_size(0) * 6] = res_3_6;
1770
slice[(idy + get_local_size(1) * 3) * slice_width + idx + get_local_size(0) * 7] = res_3_7;
1771
slice[(idy + get_local_size(1) * 4) * slice_width + idx + get_local_size(0) * 0] = res_4_0;
1772
slice[(idy + get_local_size(1) * 4) * slice_width + idx + get_local_size(0) * 1] = res_4_1;
1773
slice[(idy + get_local_size(1) * 4) * slice_width + idx + get_local_size(0) * 2] = res_4_2;
1774
slice[(idy + get_local_size(1) * 4) * slice_width + idx + get_local_size(0) * 3] = res_4_3;
1775
slice[(idy + get_local_size(1) * 4) * slice_width + idx + get_local_size(0) * 4] = res_4_4;
1776
slice[(idy + get_local_size(1) * 4) * slice_width + idx + get_local_size(0) * 5] = res_4_5;
1777
slice[(idy + get_local_size(1) * 4) * slice_width + idx + get_local_size(0) * 6] = res_4_6;
1778
slice[(idy + get_local_size(1) * 4) * slice_width + idx + get_local_size(0) * 7] = res_4_7;
1779
slice[(idy + get_local_size(1) * 5) * slice_width + idx + get_local_size(0) * 0] = res_5_0;
1780
slice[(idy + get_local_size(1) * 5) * slice_width + idx + get_local_size(0) * 1] = res_5_1;
1781
slice[(idy + get_local_size(1) * 5) * slice_width + idx + get_local_size(0) * 2] = res_5_2;
1782
slice[(idy + get_local_size(1) * 5) * slice_width + idx + get_local_size(0) * 3] = res_5_3;
1783
slice[(idy + get_local_size(1) * 5) * slice_width + idx + get_local_size(0) * 4] = res_5_4;
1784
slice[(idy + get_local_size(1) * 5) * slice_width + idx + get_local_size(0) * 5] = res_5_5;
1785
slice[(idy + get_local_size(1) * 5) * slice_width + idx + get_local_size(0) * 6] = res_5_6;
1786
slice[(idy + get_local_size(1) * 5) * slice_width + idx + get_local_size(0) * 7] = res_5_7;
1787
slice[(idy + get_local_size(1) * 6) * slice_width + idx + get_local_size(0) * 0] = res_6_0;
1788
slice[(idy + get_local_size(1) * 6) * slice_width + idx + get_local_size(0) * 1] = res_6_1;
1789
slice[(idy + get_local_size(1) * 6) * slice_width + idx + get_local_size(0) * 2] = res_6_2;
1790
slice[(idy + get_local_size(1) * 6) * slice_width + idx + get_local_size(0) * 3] = res_6_3;
1791
slice[(idy + get_local_size(1) * 6) * slice_width + idx + get_local_size(0) * 4] = res_6_4;
1792
slice[(idy + get_local_size(1) * 6) * slice_width + idx + get_local_size(0) * 5] = res_6_5;
1793
slice[(idy + get_local_size(1) * 6) * slice_width + idx + get_local_size(0) * 6] = res_6_6;
1794
slice[(idy + get_local_size(1) * 6) * slice_width + idx + get_local_size(0) * 7] = res_6_7;
1795
slice[(idy + get_local_size(1) * 7) * slice_width + idx + get_local_size(0) * 0] = res_7_0;
1796
slice[(idy + get_local_size(1) * 7) * slice_width + idx + get_local_size(0) * 1] = res_7_1;
1797
slice[(idy + get_local_size(1) * 7) * slice_width + idx + get_local_size(0) * 2] = res_7_2;
1798
slice[(idy + get_local_size(1) * 7) * slice_width + idx + get_local_size(0) * 3] = res_7_3;
1799
slice[(idy + get_local_size(1) * 7) * slice_width + idx + get_local_size(0) * 4] = res_7_4;
1800
slice[(idy + get_local_size(1) * 7) * slice_width + idx + get_local_size(0) * 5] = res_7_5;
1801
slice[(idy + get_local_size(1) * 7) * slice_width + idx + get_local_size(0) * 6] = res_7_6;
1802
slice[(idy + get_local_size(1) * 7) * slice_width + idx + get_local_size(0) * 7] = res_7_7;
1806
__kernel void hst_backproject_8ppt_vliw_linear(const int num_proj,
1810
__read_only image2d_t sinogram,
1811
__global float *slice,
1812
__constant float4 *c_all,
1813
__local float *share
1815
__local float2 *cache = (__local float2*)share;
1816
const int tidx = get_local_id(0);
1817
const int tidy = get_local_id(1);
1819
const int bidx = PPT * get_group_id(0) * get_local_size(0);
1820
const int bidy = PPT * (get_global_id(1) - get_local_id(1));
1822
const float bx = bidx + off_x;
1823
const float by = bidy + off_y;
1825
const int sidx = tidx;
1826
const int sidy = tidy;
1828
const int idx = bidx + sidx;
1829
const int idy = bidy + sidy;
1831
const float x = idx + off_x;
1832
const float y = idy + off_y;
1834
const int slice_width = PPT * get_global_size(0);
1836
float res[PPT][PPT] = {0};
1837
for (int proj = 0; proj < num_proj; proj++) {
1838
float4 all = c_all[proj]; all.y = -all.y;
1839
float minh = floor(all.z + bx * all.x + by * all.y + all.w);
1842
for (int i = 0; i < 2; i++) {
1843
int pos = i * get_local_size(0) * get_local_size(0) + tidy * get_local_size(0) + tidx;
1844
cache[pos].x = read_imagef(sinogram, volumeSampler, (float2)(minh + pos, proj + .5f)).x;
1847
barrier(CLK_LOCAL_MEM_FENCE);
1849
for (int i = 0; i < 2; i++) {
1850
int pos = i * get_local_size(0) * get_local_size(0) + tidy * get_local_size(0) + tidx;
1851
cache[pos].y = cache[pos + 1].x - cache[pos].x;
1854
barrier(CLK_LOCAL_MEM_FENCE);
1856
float h = mad(x, all.x, mad(y, all.y, all.z)) - minh;
1861
for (int i = 0; i < PPT; i++) {
1863
for (int j = 0; j < PPT; j++) {
1864
subh = h + (i * get_local_size(0)) * all.y + (j * get_local_size(0)) * all.x;
1865
isubh = floor(subh);
1866
v = subh_cache(isubh);
1867
res[i][j] += v.x + (subh - isubh)*v.y;
1871
barrier(CLK_LOCAL_MEM_FENCE);
1875
for (int i = 0; i < PPT; i++) {
1877
for (int j = 0; j < PPT; j++) {
1878
slice[(idy + get_local_size(1) * i) * slice_width + idx + get_local_size(0) * j] = res[i][j];
1885
__kernel void hst_backproject_8ppt_vliw_linear(const int num_proj,
1889
__read_only image2d_t sinogram,
1890
__global float *slice,
1891
__constant float4 *c_all,
1892
__local float *share
1894
__local float2 *cache = (__local float2*)share;
1895
const int tidx = get_local_id(0);
1896
const int tidy = get_local_id(1);
1898
const int bidx = PPT * get_group_id(0) * get_local_size(0);
1899
const int bidy = PPT * (get_global_id(1) - get_local_id(1));
1901
const float bx = bidx + off_x;
1902
const float by = bidy + off_y;
1904
const int sidx = tidx;
1905
const int sidy = tidy;
1907
const int idx = bidx + sidx;
1908
const int idy = bidy + sidy;
1910
const float x = idx + off_x;
1911
const float y = idy + off_y;
1913
const int slice_width = PPT * get_global_size(0);
1982
for (int proj = 0; proj < num_proj; proj++) {
1983
float4 all = c_all[proj]; all.y = -all.y;
1984
float minh = floor(all.z + bx * all.x + by * all.y + all.w);
1987
for (int i = 0; i < 2; i++) {
1988
int pos = i * get_local_size(0) * get_local_size(0) + tidy * get_local_size(0) + tidx;
1989
cache[pos].x = read_imagef(sinogram, volumeSampler, (float2)(minh + pos, proj + .5f)).x;
1992
barrier(CLK_LOCAL_MEM_FENCE);
1994
for (int i = 0; i < 2; i++) {
1995
int pos = i * get_local_size(0) * get_local_size(0) + tidy * get_local_size(0) + tidx;
1996
cache[pos].y = cache[pos + 1].x - cache[pos].x;
1999
barrier(CLK_LOCAL_MEM_FENCE);
2001
float h = mad(x, all.x, mad(y, all.y, all.z)) - minh;
2005
subh = h + (0 * get_local_size(0)) * all.y + (0 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_0_0 += v.x + (subh - isubh) * v.y;
2006
subh = h + (0 * get_local_size(0)) * all.y + (1 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_0_1 += v.x + (subh - isubh) * v.y;
2007
subh = h + (0 * get_local_size(0)) * all.y + (2 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_0_2 += v.x + (subh - isubh) * v.y;
2008
subh = h + (0 * get_local_size(0)) * all.y + (3 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_0_3 += v.x + (subh - isubh) * v.y;
2009
subh = h + (0 * get_local_size(0)) * all.y + (4 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_0_4 += v.x + (subh - isubh) * v.y;
2010
subh = h + (0 * get_local_size(0)) * all.y + (5 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_0_5 += v.x + (subh - isubh) * v.y;
2011
subh = h + (0 * get_local_size(0)) * all.y + (6 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_0_6 += v.x + (subh - isubh) * v.y;
2012
subh = h + (0 * get_local_size(0)) * all.y + (7 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_0_7 += v.x + (subh - isubh) * v.y;
2013
subh = h + (1 * get_local_size(0)) * all.y + (0 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_1_0 += v.x + (subh - isubh) * v.y;
2014
subh = h + (1 * get_local_size(0)) * all.y + (1 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_1_1 += v.x + (subh - isubh) * v.y;
2015
subh = h + (1 * get_local_size(0)) * all.y + (2 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_1_2 += v.x + (subh - isubh) * v.y;
2016
subh = h + (1 * get_local_size(0)) * all.y + (3 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_1_3 += v.x + (subh - isubh) * v.y;
2017
subh = h + (1 * get_local_size(0)) * all.y + (4 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_1_4 += v.x + (subh - isubh) * v.y;
2018
subh = h + (1 * get_local_size(0)) * all.y + (5 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_1_5 += v.x + (subh - isubh) * v.y;
2019
subh = h + (1 * get_local_size(0)) * all.y + (6 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_1_6 += v.x + (subh - isubh) * v.y;
2020
subh = h + (1 * get_local_size(0)) * all.y + (7 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_1_7 += v.x + (subh - isubh) * v.y;
2021
subh = h + (2 * get_local_size(0)) * all.y + (0 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_2_0 += v.x + (subh - isubh) * v.y;
2022
subh = h + (2 * get_local_size(0)) * all.y + (1 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_2_1 += v.x + (subh - isubh) * v.y;
2023
subh = h + (2 * get_local_size(0)) * all.y + (2 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_2_2 += v.x + (subh - isubh) * v.y;
2024
subh = h + (2 * get_local_size(0)) * all.y + (3 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_2_3 += v.x + (subh - isubh) * v.y;
2025
subh = h + (2 * get_local_size(0)) * all.y + (4 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_2_4 += v.x + (subh - isubh) * v.y;
2026
subh = h + (2 * get_local_size(0)) * all.y + (5 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_2_5 += v.x + (subh - isubh) * v.y;
2027
subh = h + (2 * get_local_size(0)) * all.y + (6 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_2_6 += v.x + (subh - isubh) * v.y;
2028
subh = h + (2 * get_local_size(0)) * all.y + (7 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_2_7 += v.x + (subh - isubh) * v.y;
2029
subh = h + (3 * get_local_size(0)) * all.y + (0 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_3_0 += v.x + (subh - isubh) * v.y;
2030
subh = h + (3 * get_local_size(0)) * all.y + (1 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_3_1 += v.x + (subh - isubh) * v.y;
2031
subh = h + (3 * get_local_size(0)) * all.y + (2 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_3_2 += v.x + (subh - isubh) * v.y;
2032
subh = h + (3 * get_local_size(0)) * all.y + (3 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_3_3 += v.x + (subh - isubh) * v.y;
2033
subh = h + (3 * get_local_size(0)) * all.y + (4 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_3_4 += v.x + (subh - isubh) * v.y;
2034
subh = h + (3 * get_local_size(0)) * all.y + (5 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_3_5 += v.x + (subh - isubh) * v.y;
2035
subh = h + (3 * get_local_size(0)) * all.y + (6 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_3_6 += v.x + (subh - isubh) * v.y;
2036
subh = h + (3 * get_local_size(0)) * all.y + (7 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_3_7 += v.x + (subh - isubh) * v.y;
2037
subh = h + (4 * get_local_size(0)) * all.y + (0 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_4_0 += v.x + (subh - isubh) * v.y;
2038
subh = h + (4 * get_local_size(0)) * all.y + (1 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_4_1 += v.x + (subh - isubh) * v.y;
2039
subh = h + (4 * get_local_size(0)) * all.y + (2 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_4_2 += v.x + (subh - isubh) * v.y;
2040
subh = h + (4 * get_local_size(0)) * all.y + (3 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_4_3 += v.x + (subh - isubh) * v.y;
2041
subh = h + (4 * get_local_size(0)) * all.y + (4 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_4_4 += v.x + (subh - isubh) * v.y;
2042
subh = h + (4 * get_local_size(0)) * all.y + (5 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_4_5 += v.x + (subh - isubh) * v.y;
2043
subh = h + (4 * get_local_size(0)) * all.y + (6 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_4_6 += v.x + (subh - isubh) * v.y;
2044
subh = h + (4 * get_local_size(0)) * all.y + (7 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_4_7 += v.x + (subh - isubh) * v.y;
2045
subh = h + (5 * get_local_size(0)) * all.y + (0 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_5_0 += v.x + (subh - isubh) * v.y;
2046
subh = h + (5 * get_local_size(0)) * all.y + (1 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_5_1 += v.x + (subh - isubh) * v.y;
2047
subh = h + (5 * get_local_size(0)) * all.y + (2 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_5_2 += v.x + (subh - isubh) * v.y;
2048
subh = h + (5 * get_local_size(0)) * all.y + (3 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_5_3 += v.x + (subh - isubh) * v.y;
2049
subh = h + (5 * get_local_size(0)) * all.y + (4 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_5_4 += v.x + (subh - isubh) * v.y;
2050
subh = h + (5 * get_local_size(0)) * all.y + (5 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_5_5 += v.x + (subh - isubh) * v.y;
2051
subh = h + (5 * get_local_size(0)) * all.y + (6 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_5_6 += v.x + (subh - isubh) * v.y;
2052
subh = h + (5 * get_local_size(0)) * all.y + (7 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_5_7 += v.x + (subh - isubh) * v.y;
2053
subh = h + (6 * get_local_size(0)) * all.y + (0 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_6_0 += v.x + (subh - isubh) * v.y;
2054
subh = h + (6 * get_local_size(0)) * all.y + (1 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_6_1 += v.x + (subh - isubh) * v.y;
2055
subh = h + (6 * get_local_size(0)) * all.y + (2 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_6_2 += v.x + (subh - isubh) * v.y;
2056
subh = h + (6 * get_local_size(0)) * all.y + (3 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_6_3 += v.x + (subh - isubh) * v.y;
2057
subh = h + (6 * get_local_size(0)) * all.y + (4 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_6_4 += v.x + (subh - isubh) * v.y;
2058
subh = h + (6 * get_local_size(0)) * all.y + (5 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_6_5 += v.x + (subh - isubh) * v.y;
2059
subh = h + (6 * get_local_size(0)) * all.y + (6 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_6_6 += v.x + (subh - isubh) * v.y;
2060
subh = h + (6 * get_local_size(0)) * all.y + (7 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_6_7 += v.x + (subh - isubh) * v.y;
2061
subh = h + (7 * get_local_size(0)) * all.y + (0 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_7_0 += v.x + (subh - isubh) * v.y;
2062
subh = h + (7 * get_local_size(0)) * all.y + (1 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_7_1 += v.x + (subh - isubh) * v.y;
2063
subh = h + (7 * get_local_size(0)) * all.y + (2 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_7_2 += v.x + (subh - isubh) * v.y;
2064
subh = h + (7 * get_local_size(0)) * all.y + (3 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_7_3 += v.x + (subh - isubh) * v.y;
2065
subh = h + (7 * get_local_size(0)) * all.y + (4 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_7_4 += v.x + (subh - isubh) * v.y;
2066
subh = h + (7 * get_local_size(0)) * all.y + (5 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_7_5 += v.x + (subh - isubh) * v.y;
2067
subh = h + (7 * get_local_size(0)) * all.y + (6 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_7_6 += v.x + (subh - isubh) * v.y;
2068
subh = h + (7 * get_local_size(0)) * all.y + (7 * get_local_size(0)) * all.x; isubh = floor(subh); v = subh_cache(isubh); res_7_7 += v.x + (subh - isubh) * v.y;
2070
barrier(CLK_LOCAL_MEM_FENCE);
2073
slice[(idy + get_local_size(1) * 0) * slice_width + idx + get_local_size(0) * 0] = res_0_0;
2074
slice[(idy + get_local_size(1) * 0) * slice_width + idx + get_local_size(0) * 1] = res_0_1;
2075
slice[(idy + get_local_size(1) * 0) * slice_width + idx + get_local_size(0) * 2] = res_0_2;
2076
slice[(idy + get_local_size(1) * 0) * slice_width + idx + get_local_size(0) * 3] = res_0_3;
2077
slice[(idy + get_local_size(1) * 0) * slice_width + idx + get_local_size(0) * 4] = res_0_4;
2078
slice[(idy + get_local_size(1) * 0) * slice_width + idx + get_local_size(0) * 5] = res_0_5;
2079
slice[(idy + get_local_size(1) * 0) * slice_width + idx + get_local_size(0) * 6] = res_0_6;
2080
slice[(idy + get_local_size(1) * 0) * slice_width + idx + get_local_size(0) * 7] = res_0_7;
2081
slice[(idy + get_local_size(1) * 1) * slice_width + idx + get_local_size(0) * 0] = res_1_0;
2082
slice[(idy + get_local_size(1) * 1) * slice_width + idx + get_local_size(0) * 1] = res_1_1;
2083
slice[(idy + get_local_size(1) * 1) * slice_width + idx + get_local_size(0) * 2] = res_1_2;
2084
slice[(idy + get_local_size(1) * 1) * slice_width + idx + get_local_size(0) * 3] = res_1_3;
2085
slice[(idy + get_local_size(1) * 1) * slice_width + idx + get_local_size(0) * 4] = res_1_4;
2086
slice[(idy + get_local_size(1) * 1) * slice_width + idx + get_local_size(0) * 5] = res_1_5;
2087
slice[(idy + get_local_size(1) * 1) * slice_width + idx + get_local_size(0) * 6] = res_1_6;
2088
slice[(idy + get_local_size(1) * 1) * slice_width + idx + get_local_size(0) * 7] = res_1_7;
2089
slice[(idy + get_local_size(1) * 2) * slice_width + idx + get_local_size(0) * 0] = res_2_0;
2090
slice[(idy + get_local_size(1) * 2) * slice_width + idx + get_local_size(0) * 1] = res_2_1;
2091
slice[(idy + get_local_size(1) * 2) * slice_width + idx + get_local_size(0) * 2] = res_2_2;
2092
slice[(idy + get_local_size(1) * 2) * slice_width + idx + get_local_size(0) * 3] = res_2_3;
2093
slice[(idy + get_local_size(1) * 2) * slice_width + idx + get_local_size(0) * 4] = res_2_4;
2094
slice[(idy + get_local_size(1) * 2) * slice_width + idx + get_local_size(0) * 5] = res_2_5;
2095
slice[(idy + get_local_size(1) * 2) * slice_width + idx + get_local_size(0) * 6] = res_2_6;
2096
slice[(idy + get_local_size(1) * 2) * slice_width + idx + get_local_size(0) * 7] = res_2_7;
2097
slice[(idy + get_local_size(1) * 3) * slice_width + idx + get_local_size(0) * 0] = res_3_0;
2098
slice[(idy + get_local_size(1) * 3) * slice_width + idx + get_local_size(0) * 1] = res_3_1;
2099
slice[(idy + get_local_size(1) * 3) * slice_width + idx + get_local_size(0) * 2] = res_3_2;
2100
slice[(idy + get_local_size(1) * 3) * slice_width + idx + get_local_size(0) * 3] = res_3_3;
2101
slice[(idy + get_local_size(1) * 3) * slice_width + idx + get_local_size(0) * 4] = res_3_4;
2102
slice[(idy + get_local_size(1) * 3) * slice_width + idx + get_local_size(0) * 5] = res_3_5;
2103
slice[(idy + get_local_size(1) * 3) * slice_width + idx + get_local_size(0) * 6] = res_3_6;
2104
slice[(idy + get_local_size(1) * 3) * slice_width + idx + get_local_size(0) * 7] = res_3_7;
2105
slice[(idy + get_local_size(1) * 4) * slice_width + idx + get_local_size(0) * 0] = res_4_0;
2106
slice[(idy + get_local_size(1) * 4) * slice_width + idx + get_local_size(0) * 1] = res_4_1;
2107
slice[(idy + get_local_size(1) * 4) * slice_width + idx + get_local_size(0) * 2] = res_4_2;
2108
slice[(idy + get_local_size(1) * 4) * slice_width + idx + get_local_size(0) * 3] = res_4_3;
2109
slice[(idy + get_local_size(1) * 4) * slice_width + idx + get_local_size(0) * 4] = res_4_4;
2110
slice[(idy + get_local_size(1) * 4) * slice_width + idx + get_local_size(0) * 5] = res_4_5;
2111
slice[(idy + get_local_size(1) * 4) * slice_width + idx + get_local_size(0) * 6] = res_4_6;
2112
slice[(idy + get_local_size(1) * 4) * slice_width + idx + get_local_size(0) * 7] = res_4_7;
2113
slice[(idy + get_local_size(1) * 5) * slice_width + idx + get_local_size(0) * 0] = res_5_0;
2114
slice[(idy + get_local_size(1) * 5) * slice_width + idx + get_local_size(0) * 1] = res_5_1;
2115
slice[(idy + get_local_size(1) * 5) * slice_width + idx + get_local_size(0) * 2] = res_5_2;
2116
slice[(idy + get_local_size(1) * 5) * slice_width + idx + get_local_size(0) * 3] = res_5_3;
2117
slice[(idy + get_local_size(1) * 5) * slice_width + idx + get_local_size(0) * 4] = res_5_4;
2118
slice[(idy + get_local_size(1) * 5) * slice_width + idx + get_local_size(0) * 5] = res_5_5;
2119
slice[(idy + get_local_size(1) * 5) * slice_width + idx + get_local_size(0) * 6] = res_5_6;
2120
slice[(idy + get_local_size(1) * 5) * slice_width + idx + get_local_size(0) * 7] = res_5_7;
2121
slice[(idy + get_local_size(1) * 6) * slice_width + idx + get_local_size(0) * 0] = res_6_0;
2122
slice[(idy + get_local_size(1) * 6) * slice_width + idx + get_local_size(0) * 1] = res_6_1;
2123
slice[(idy + get_local_size(1) * 6) * slice_width + idx + get_local_size(0) * 2] = res_6_2;
2124
slice[(idy + get_local_size(1) * 6) * slice_width + idx + get_local_size(0) * 3] = res_6_3;
2125
slice[(idy + get_local_size(1) * 6) * slice_width + idx + get_local_size(0) * 4] = res_6_4;
2126
slice[(idy + get_local_size(1) * 6) * slice_width + idx + get_local_size(0) * 5] = res_6_5;
2127
slice[(idy + get_local_size(1) * 6) * slice_width + idx + get_local_size(0) * 6] = res_6_6;
2128
slice[(idy + get_local_size(1) * 6) * slice_width + idx + get_local_size(0) * 7] = res_6_7;
2129
slice[(idy + get_local_size(1) * 7) * slice_width + idx + get_local_size(0) * 0] = res_7_0;
2130
slice[(idy + get_local_size(1) * 7) * slice_width + idx + get_local_size(0) * 1] = res_7_1;
2131
slice[(idy + get_local_size(1) * 7) * slice_width + idx + get_local_size(0) * 2] = res_7_2;
2132
slice[(idy + get_local_size(1) * 7) * slice_width + idx + get_local_size(0) * 3] = res_7_3;
2133
slice[(idy + get_local_size(1) * 7) * slice_width + idx + get_local_size(0) * 4] = res_7_4;
2134
slice[(idy + get_local_size(1) * 7) * slice_width + idx + get_local_size(0) * 5] = res_7_5;
2135
slice[(idy + get_local_size(1) * 7) * slice_width + idx + get_local_size(0) * 6] = res_7_6;
2136
slice[(idy + get_local_size(1) * 7) * slice_width + idx + get_local_size(0) * 7] = res_7_7;
2140
__kernel void hst_backproject_6ppt_vliw_tex(const int num_proj,
2144
__read_only image2d_t sinogram,
2145
__global float *slice,
2146
__constant float4 *c_all/*,
2147
__local float *shared*/
2150
const int tidx = get_local_id(0);
2151
const int tidy = get_local_id(1);
2153
const int sidx = tidx;
2154
const int sidy = tidy;
2156
const int idx = PPT * get_group_id(0) * get_local_size(0) + sidx;
2157
const int idy = PPT * (get_global_id(1) - get_local_id(1)) + sidy;
2159
const float x = idx + off_x;
2160
const float y = idy + off_y;
2162
const int slice_width = PPT * get_global_size(0);
2201
for (int proj = 0; proj < num_proj; proj++) {
2202
float4 all = c_all[proj];
2204
float subh = mad(x, all.x, mad(y, all.y, all.z));
2206
res_0_0 += read_imagef(sinogram, volumeSampler, (float2)(subh + 0 * BLOCK_SIZE * all.y + 0 * BLOCK_SIZE * all.x, proj + .5f)).x;
2207
res_0_1 += read_imagef(sinogram, volumeSampler, (float2)(subh + 0 * BLOCK_SIZE * all.y + 1 * BLOCK_SIZE * all.x, proj + .5f)).x;
2208
res_0_2 += read_imagef(sinogram, volumeSampler, (float2)(subh + 0 * BLOCK_SIZE * all.y + 2 * BLOCK_SIZE * all.x, proj + .5f)).x;
2209
res_0_3 += read_imagef(sinogram, volumeSampler, (float2)(subh + 0 * BLOCK_SIZE * all.y + 3 * BLOCK_SIZE * all.x, proj + .5f)).x;
2210
res_0_4 += read_imagef(sinogram, volumeSampler, (float2)(subh + 0 * BLOCK_SIZE * all.y + 4 * BLOCK_SIZE * all.x, proj + .5f)).x;
2211
res_0_5 += read_imagef(sinogram, volumeSampler, (float2)(subh + 0 * BLOCK_SIZE * all.y + 5 * BLOCK_SIZE * all.x, proj + .5f)).x;
2212
res_1_0 += read_imagef(sinogram, volumeSampler, (float2)(subh + 1 * BLOCK_SIZE * all.y + 0 * BLOCK_SIZE * all.x, proj + .5f)).x;
2213
res_1_1 += read_imagef(sinogram, volumeSampler, (float2)(subh + 1 * BLOCK_SIZE * all.y + 1 * BLOCK_SIZE * all.x, proj + .5f)).x;
2214
res_1_2 += read_imagef(sinogram, volumeSampler, (float2)(subh + 1 * BLOCK_SIZE * all.y + 2 * BLOCK_SIZE * all.x, proj + .5f)).x;
2215
res_1_3 += read_imagef(sinogram, volumeSampler, (float2)(subh + 1 * BLOCK_SIZE * all.y + 3 * BLOCK_SIZE * all.x, proj + .5f)).x;
2216
res_1_4 += read_imagef(sinogram, volumeSampler, (float2)(subh + 1 * BLOCK_SIZE * all.y + 4 * BLOCK_SIZE * all.x, proj + .5f)).x;
2217
res_1_5 += read_imagef(sinogram, volumeSampler, (float2)(subh + 1 * BLOCK_SIZE * all.y + 5 * BLOCK_SIZE * all.x, proj + .5f)).x;
2218
res_2_0 += read_imagef(sinogram, volumeSampler, (float2)(subh + 2 * BLOCK_SIZE * all.y + 0 * BLOCK_SIZE * all.x, proj + .5f)).x;
2219
res_2_1 += read_imagef(sinogram, volumeSampler, (float2)(subh + 2 * BLOCK_SIZE * all.y + 1 * BLOCK_SIZE * all.x, proj + .5f)).x;
2220
res_2_2 += read_imagef(sinogram, volumeSampler, (float2)(subh + 2 * BLOCK_SIZE * all.y + 2 * BLOCK_SIZE * all.x, proj + .5f)).x;
2221
res_2_3 += read_imagef(sinogram, volumeSampler, (float2)(subh + 2 * BLOCK_SIZE * all.y + 3 * BLOCK_SIZE * all.x, proj + .5f)).x;
2222
res_2_4 += read_imagef(sinogram, volumeSampler, (float2)(subh + 2 * BLOCK_SIZE * all.y + 4 * BLOCK_SIZE * all.x, proj + .5f)).x;
2223
res_2_5 += read_imagef(sinogram, volumeSampler, (float2)(subh + 2 * BLOCK_SIZE * all.y + 5 * BLOCK_SIZE * all.x, proj + .5f)).x;
2224
res_3_0 += read_imagef(sinogram, volumeSampler, (float2)(subh + 3 * BLOCK_SIZE * all.y + 0 * BLOCK_SIZE * all.x, proj + .5f)).x;
2225
res_3_1 += read_imagef(sinogram, volumeSampler, (float2)(subh + 3 * BLOCK_SIZE * all.y + 1 * BLOCK_SIZE * all.x, proj + .5f)).x;
2226
res_3_2 += read_imagef(sinogram, volumeSampler, (float2)(subh + 3 * BLOCK_SIZE * all.y + 2 * BLOCK_SIZE * all.x, proj + .5f)).x;
2227
res_3_3 += read_imagef(sinogram, volumeSampler, (float2)(subh + 3 * BLOCK_SIZE * all.y + 3 * BLOCK_SIZE * all.x, proj + .5f)).x;
2228
res_3_4 += read_imagef(sinogram, volumeSampler, (float2)(subh + 3 * BLOCK_SIZE * all.y + 4 * BLOCK_SIZE * all.x, proj + .5f)).x;
2229
res_3_5 += read_imagef(sinogram, volumeSampler, (float2)(subh + 3 * BLOCK_SIZE * all.y + 5 * BLOCK_SIZE * all.x, proj + .5f)).x;
2230
res_4_0 += read_imagef(sinogram, volumeSampler, (float2)(subh + 4 * BLOCK_SIZE * all.y + 0 * BLOCK_SIZE * all.x, proj + .5f)).x;
2231
res_4_1 += read_imagef(sinogram, volumeSampler, (float2)(subh + 4 * BLOCK_SIZE * all.y + 1 * BLOCK_SIZE * all.x, proj + .5f)).x;
2232
res_4_2 += read_imagef(sinogram, volumeSampler, (float2)(subh + 4 * BLOCK_SIZE * all.y + 2 * BLOCK_SIZE * all.x, proj + .5f)).x;
2233
res_4_3 += read_imagef(sinogram, volumeSampler, (float2)(subh + 4 * BLOCK_SIZE * all.y + 3 * BLOCK_SIZE * all.x, proj + .5f)).x;
2234
res_4_4 += read_imagef(sinogram, volumeSampler, (float2)(subh + 4 * BLOCK_SIZE * all.y + 4 * BLOCK_SIZE * all.x, proj + .5f)).x;
2235
res_4_5 += read_imagef(sinogram, volumeSampler, (float2)(subh + 4 * BLOCK_SIZE * all.y + 5 * BLOCK_SIZE * all.x, proj + .5f)).x;
2236
res_5_0 += read_imagef(sinogram, volumeSampler, (float2)(subh + 5 * BLOCK_SIZE * all.y + 0 * BLOCK_SIZE * all.x, proj + .5f)).x;
2237
res_5_1 += read_imagef(sinogram, volumeSampler, (float2)(subh + 5 * BLOCK_SIZE * all.y + 1 * BLOCK_SIZE * all.x, proj + .5f)).x;
2238
res_5_2 += read_imagef(sinogram, volumeSampler, (float2)(subh + 5 * BLOCK_SIZE * all.y + 2 * BLOCK_SIZE * all.x, proj + .5f)).x;
2239
res_5_3 += read_imagef(sinogram, volumeSampler, (float2)(subh + 5 * BLOCK_SIZE * all.y + 3 * BLOCK_SIZE * all.x, proj + .5f)).x;
2240
res_5_4 += read_imagef(sinogram, volumeSampler, (float2)(subh + 5 * BLOCK_SIZE * all.y + 4 * BLOCK_SIZE * all.x, proj + .5f)).x;
2241
res_5_5 += read_imagef(sinogram, volumeSampler, (float2)(subh + 5 * BLOCK_SIZE * all.y + 5 * BLOCK_SIZE * all.x, proj + .5f)).x;
2246
slice[(idy + BLOCK_SIZE * 0) * slice_width + idx + BLOCK_SIZE * 0] = res_0_0;
2247
slice[(idy + BLOCK_SIZE * 0) * slice_width + idx + BLOCK_SIZE * 1] = res_0_1;
2248
slice[(idy + BLOCK_SIZE * 0) * slice_width + idx + BLOCK_SIZE * 2] = res_0_2;
2249
slice[(idy + BLOCK_SIZE * 0) * slice_width + idx + BLOCK_SIZE * 3] = res_0_3;
2250
slice[(idy + BLOCK_SIZE * 0) * slice_width + idx + BLOCK_SIZE * 4] = res_0_4;
2251
slice[(idy + BLOCK_SIZE * 0) * slice_width + idx + BLOCK_SIZE * 5] = res_0_5;
2252
slice[(idy + BLOCK_SIZE * 1) * slice_width + idx + BLOCK_SIZE * 0] = res_1_0;
2253
slice[(idy + BLOCK_SIZE * 1) * slice_width + idx + BLOCK_SIZE * 1] = res_1_1;
2254
slice[(idy + BLOCK_SIZE * 1) * slice_width + idx + BLOCK_SIZE * 2] = res_1_2;
2255
slice[(idy + BLOCK_SIZE * 1) * slice_width + idx + BLOCK_SIZE * 3] = res_1_3;
2256
slice[(idy + BLOCK_SIZE * 1) * slice_width + idx + BLOCK_SIZE * 4] = res_1_4;
2257
slice[(idy + BLOCK_SIZE * 1) * slice_width + idx + BLOCK_SIZE * 5] = res_1_5;
2258
slice[(idy + BLOCK_SIZE * 2) * slice_width + idx + BLOCK_SIZE * 0] = res_2_0;
2259
slice[(idy + BLOCK_SIZE * 2) * slice_width + idx + BLOCK_SIZE * 1] = res_2_1;
2260
slice[(idy + BLOCK_SIZE * 2) * slice_width + idx + BLOCK_SIZE * 2] = res_2_2;
2261
slice[(idy + BLOCK_SIZE * 2) * slice_width + idx + BLOCK_SIZE * 3] = res_2_3;
2262
slice[(idy + BLOCK_SIZE * 2) * slice_width + idx + BLOCK_SIZE * 4] = res_2_4;
2263
slice[(idy + BLOCK_SIZE * 2) * slice_width + idx + BLOCK_SIZE * 5] = res_2_5;
2264
slice[(idy + BLOCK_SIZE * 3) * slice_width + idx + BLOCK_SIZE * 0] = res_3_0;
2265
slice[(idy + BLOCK_SIZE * 3) * slice_width + idx + BLOCK_SIZE * 1] = res_3_1;
2266
slice[(idy + BLOCK_SIZE * 3) * slice_width + idx + BLOCK_SIZE * 2] = res_3_2;
2267
slice[(idy + BLOCK_SIZE * 3) * slice_width + idx + BLOCK_SIZE * 3] = res_3_3;
2268
slice[(idy + BLOCK_SIZE * 3) * slice_width + idx + BLOCK_SIZE * 4] = res_3_4;
2269
slice[(idy + BLOCK_SIZE * 3) * slice_width + idx + BLOCK_SIZE * 5] = res_3_5;
2270
slice[(idy + BLOCK_SIZE * 4) * slice_width + idx + BLOCK_SIZE * 0] = res_4_0;
2271
slice[(idy + BLOCK_SIZE * 4) * slice_width + idx + BLOCK_SIZE * 1] = res_4_1;
2272
slice[(idy + BLOCK_SIZE * 4) * slice_width + idx + BLOCK_SIZE * 2] = res_4_2;
2273
slice[(idy + BLOCK_SIZE * 4) * slice_width + idx + BLOCK_SIZE * 3] = res_4_3;
2274
slice[(idy + BLOCK_SIZE * 4) * slice_width + idx + BLOCK_SIZE * 4] = res_4_4;
2275
slice[(idy + BLOCK_SIZE * 4) * slice_width + idx + BLOCK_SIZE * 5] = res_4_5;
2276
slice[(idy + BLOCK_SIZE * 5) * slice_width + idx + BLOCK_SIZE * 0] = res_5_0;
2277
slice[(idy + BLOCK_SIZE * 5) * slice_width + idx + BLOCK_SIZE * 1] = res_5_1;
2278
slice[(idy + BLOCK_SIZE * 5) * slice_width + idx + BLOCK_SIZE * 2] = res_5_2;
2279
slice[(idy + BLOCK_SIZE * 5) * slice_width + idx + BLOCK_SIZE * 3] = res_5_3;
2280
slice[(idy + BLOCK_SIZE * 5) * slice_width + idx + BLOCK_SIZE * 4] = res_5_4;
2281
slice[(idy + BLOCK_SIZE * 5) * slice_width + idx + BLOCK_SIZE * 5] = res_5_5;