3
// File: fft_execute.cpp
7
// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple")
8
// in consideration of your agreement to the following terms, and your use,
9
// installation, modification or redistribution of this Apple software
10
// constitutes acceptance of these terms. If you do not agree with these
11
// terms, please do not use, install, modify or redistribute this Apple
14
// In consideration of your agreement to abide by the following terms, and
15
// subject to these terms, Apple grants you a personal, non - exclusive
16
// license, under Apple's copyrights in this original Apple software ( the
17
// "Apple Software" ), to use, reproduce, modify and redistribute the Apple
18
// Software, with or without modifications, in source and / or binary forms;
19
// provided that if you redistribute the Apple Software in its entirety and
20
// without modifications, you must retain this notice and the following text
21
// and disclaimers in all such redistributions of the Apple Software. Neither
22
// the name, trademarks, service marks or logos of Apple Inc. may be used to
23
// endorse or promote products derived from the Apple Software without specific
24
// prior written permission from Apple. Except as expressly stated in this
25
// notice, no other rights or licenses, express or implied, are granted by
26
// Apple herein, including but not limited to any patent rights that may be
27
// infringed by your derivative works or by other works in which the Apple
28
// Software may be incorporated.
30
// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
31
// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
32
// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
33
// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
34
// ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
36
// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
37
// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
38
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
39
// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
40
// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
41
// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
42
// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44
// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
46
////////////////////////////////////////////////////////////////////////////////////////////////////
49
#include "fft_internal.h"
55
#define max(a,b) (((a)>(b)) ? (a) : (b))
56
#define min(a,b) (((a)<(b)) ? (a) : (b))
59
allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize)
61
cl_int err = CL_SUCCESS;
62
if(plan->temp_buffer_needed && plan->last_batch_size != batchSize)
64
plan->last_batch_size = batchSize;
65
size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float);
68
clReleaseMemObject(plan->tempmemobj);
70
plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
76
allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
78
cl_int err = CL_SUCCESS;
80
if(plan->temp_buffer_needed && plan->last_batch_size != batchSize)
82
plan->last_batch_size = batchSize;
83
size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float);
85
if(plan->tempmemobj_real)
86
clReleaseMemObject(plan->tempmemobj_real);
88
if(plan->tempmemobj_imag)
89
clReleaseMemObject(plan->tempmemobj_imag);
91
plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
92
plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &terr);
99
getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems)
101
*lWorkItems = kernelInfo->num_workitems_per_workgroup;
102
int numWorkGroups = kernelInfo->num_workgroups;
103
int numXFormsPerWG = kernelInfo->num_xforms_per_workgroup;
105
switch(kernelInfo->dir)
107
case cl_fft_kernel_x:
108
*batchSize *= (plan->n.y * plan->n.z);
109
numWorkGroups = (*batchSize % numXFormsPerWG) ? (*batchSize/numXFormsPerWG + 1) : (*batchSize/numXFormsPerWG);
110
numWorkGroups *= kernelInfo->num_workgroups;
112
case cl_fft_kernel_y:
113
*batchSize *= plan->n.z;
114
numWorkGroups *= *batchSize;
116
case cl_fft_kernel_z:
117
numWorkGroups *= *batchSize;
121
*gWorkItems = numWorkGroups * *lWorkItems;
125
clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
126
cl_mem data_in, cl_mem data_out,
127
cl_int num_events, cl_event *event_list, cl_event *event )
130
cl_fft_plan *plan = (cl_fft_plan *) Plan;
131
if(plan->format != clFFT_InterleavedComplexFormat)
132
return CL_INVALID_VALUE;
135
size_t gWorkItems, lWorkItems;
138
cl_int isInPlace = data_in == data_out ? 1 : 0;
140
if((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS)
145
memObj[1] = data_out;
146
memObj[2] = plan->tempmemobj;
147
cl_fft_kernel_info *kernelInfo = plan->kernel_info;
148
int numKernels = plan->num_kernels;
150
int numKernelsOdd = numKernels & 1;
154
// at least one external dram shuffle (transpose) required
155
if(plan->temp_buffer_needed)
157
// in-place transform
166
currWrite = (numKernels & 1) ? 1 : 2;
171
if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible)
173
currWrite = currRead;
178
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
179
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
180
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
181
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
182
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
184
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
188
currRead = (currWrite == 1) ? 1 : 2;
189
currWrite = (currWrite == 1) ? 2 : 1;
191
kernelInfo = kernelInfo->next;
194
// no dram shuffle (transpose required) transform
195
// all kernels can execute in-place.
201
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
202
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
203
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
204
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
205
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
207
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
214
kernelInfo = kernelInfo->next;
222
clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
223
cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
224
cl_int num_events, cl_event *event_list, cl_event *event)
227
cl_fft_plan *plan = (cl_fft_plan *) Plan;
229
if(plan->format != clFFT_SplitComplexFormat)
230
return CL_INVALID_VALUE;
233
size_t gWorkItems, lWorkItems;
236
cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0;
238
if((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS)
241
cl_mem memObj_real[3];
242
cl_mem memObj_imag[3];
243
memObj_real[0] = data_in_real;
244
memObj_real[1] = data_out_real;
245
memObj_real[2] = plan->tempmemobj_real;
246
memObj_imag[0] = data_in_imag;
247
memObj_imag[1] = data_out_imag;
248
memObj_imag[2] = plan->tempmemobj_imag;
250
cl_fft_kernel_info *kernelInfo = plan->kernel_info;
251
int numKernels = plan->num_kernels;
253
int numKernelsOdd = numKernels & 1;
257
// at least one external dram shuffle (transpose) required
258
if(plan->temp_buffer_needed)
260
// in-place transform
269
currWrite = (numKernels & 1) ? 1 : 2;
274
if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible)
276
currWrite = currRead;
281
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
282
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]);
283
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]);
284
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]);
285
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
286
err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
287
err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
289
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
293
currRead = (currWrite == 1) ? 1 : 2;
294
currWrite = (currWrite == 1) ? 2 : 1;
296
kernelInfo = kernelInfo->next;
299
// no dram shuffle (transpose required) transform
305
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
306
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]);
307
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]);
308
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]);
309
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
310
err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
311
err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
313
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
320
kernelInfo = kernelInfo->next;
328
clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
329
size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir)
331
cl_fft_plan *plan = (cl_fft_plan *) Plan;
333
unsigned int N = numRows*numCols;
334
unsigned int nCols = numCols;
335
unsigned int sRow = startRow;
336
unsigned int rToProcess = rowsToProcess;
340
cl_device_id device_id;
341
err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL);
346
err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL);
350
gSize = min(128, gSize);
351
size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize };
352
size_t numLocalThreads[1] = { gSize };
354
err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array);
355
err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(unsigned int), &sRow);
356
err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &nCols);
357
err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &N);
358
err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &rToProcess);
359
err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(int), &d);
361
err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL);
367
clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag,
368
size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir)
370
cl_fft_plan *plan = (cl_fft_plan *) Plan;
372
unsigned int N = numRows*numCols;
373
unsigned int nCols = numCols;
374
unsigned int sRow = startRow;
375
unsigned int rToProcess = rowsToProcess;
379
cl_device_id device_id;
380
err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL);
385
err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL);
389
gSize = min(128, gSize);
390
size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize };
391
size_t numLocalThreads[1] = { gSize };
393
err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array_real);
394
err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(cl_mem), &array_imag);
395
err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &sRow);
396
err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &nCols);
397
err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &N);
398
err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(unsigned int), &rToProcess);
399
err |= clSetKernelArg(plan->twist_kernel, 6, sizeof(int), &d);
401
err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL);