/opencl/oclfft

To get this branch, use:
bzr branch http://darksoft.org/webbzr/opencl/oclfft

« back to all changes in this revision

Viewing changes to fft_execute.cpp

  • Committer: Matthias Vogelgesang
  • Date: 2011-01-31 09:18:47 UTC
  • Revision ID: git-v1:418c612a670678194837191e7c580047d8d58c28
Initial commit

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
 
 
2
//
 
3
// File:       fft_execute.cpp
 
4
//
 
5
// Version:    <1.0>
 
6
//
 
7
// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
 
8
//             in consideration of your agreement to the following terms, and your use,
 
9
//             installation, modification or redistribution of this Apple software
 
10
//             constitutes acceptance of these terms.  If you do not agree with these
 
11
//             terms, please do not use, install, modify or redistribute this Apple
 
12
//             software.¬
 
13
//
 
14
//             In consideration of your agreement to abide by the following terms, and
 
15
//             subject to these terms, Apple grants you a personal, non - exclusive
 
16
//             license, under Apple's copyrights in this original Apple software ( the
 
17
//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
 
18
//             Software, with or without modifications, in source and / or binary forms;
 
19
//             provided that if you redistribute the Apple Software in its entirety and
 
20
//             without modifications, you must retain this notice and the following text
 
21
//             and disclaimers in all such redistributions of the Apple Software. Neither
 
22
//             the name, trademarks, service marks or logos of Apple Inc. may be used to
 
23
//             endorse or promote products derived from the Apple Software without specific
 
24
//             prior written permission from Apple.  Except as expressly stated in this
 
25
//             notice, no other rights or licenses, express or implied, are granted by
 
26
//             Apple herein, including but not limited to any patent rights that may be
 
27
//             infringed by your derivative works or by other works in which the Apple
 
28
//             Software may be incorporated.
 
29
//
 
30
//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
 
31
//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
 
32
//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
 
33
//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
 
34
//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
 
35
//
 
36
//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
 
37
//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 
38
//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 
39
//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
 
40
//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
 
41
//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
 
42
//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
43
//
 
44
// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
 
45
//
 
46
////////////////////////////////////////////////////////////////////////////////////////////////////
 
47
 
 
48
 
 
49
#include "fft_internal.h"
 
50
#include "clFFT.h"
 
51
#include <stdlib.h>
 
52
#include <stdio.h>
 
53
#include <math.h>
 
54
 
 
55
#define max(a,b) (((a)>(b)) ? (a) : (b))
 
56
#define min(a,b) (((a)<(b)) ? (a) : (b))
 
57
 
 
58
static cl_int
 
59
allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize)
 
60
{
 
61
        cl_int err = CL_SUCCESS;
 
62
        if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) 
 
63
        {
 
64
                plan->last_batch_size = batchSize; 
 
65
                size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float);
 
66
                
 
67
                if(plan->tempmemobj)
 
68
                        clReleaseMemObject(plan->tempmemobj);
 
69
                        
 
70
                plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
 
71
        }
 
72
        return err;     
 
73
}
 
74
 
 
75
static cl_int
 
76
allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
 
77
{
 
78
        cl_int err = CL_SUCCESS;
 
79
        cl_int terr;
 
80
        if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) 
 
81
        {
 
82
                plan->last_batch_size = batchSize; 
 
83
                size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float);
 
84
                
 
85
                if(plan->tempmemobj_real)
 
86
                        clReleaseMemObject(plan->tempmemobj_real);
 
87
 
 
88
                if(plan->tempmemobj_imag)
 
89
                        clReleaseMemObject(plan->tempmemobj_imag);                      
 
90
                        
 
91
                plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
 
92
                plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &terr);
 
93
                err |= terr;
 
94
        }       
 
95
        return err;
 
96
}
 
97
 
 
98
void
 
99
getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems)
 
100
{
 
101
        *lWorkItems = kernelInfo->num_workitems_per_workgroup;
 
102
        int numWorkGroups = kernelInfo->num_workgroups;
 
103
    int numXFormsPerWG = kernelInfo->num_xforms_per_workgroup;
 
104
        
 
105
        switch(kernelInfo->dir)
 
106
        {
 
107
                case cl_fft_kernel_x:
 
108
            *batchSize *= (plan->n.y * plan->n.z);
 
109
            numWorkGroups = (*batchSize % numXFormsPerWG) ? (*batchSize/numXFormsPerWG + 1) : (*batchSize/numXFormsPerWG);
 
110
            numWorkGroups *= kernelInfo->num_workgroups;
 
111
                        break;
 
112
                case cl_fft_kernel_y:
 
113
                        *batchSize *= plan->n.z;
 
114
                        numWorkGroups *= *batchSize;
 
115
                        break;
 
116
                case cl_fft_kernel_z:
 
117
                        numWorkGroups *= *batchSize;
 
118
                        break;
 
119
        }
 
120
        
 
121
        *gWorkItems = numWorkGroups * *lWorkItems;
 
122
}
 
123
 
 
124
cl_int 
 
125
clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, 
 
126
                                                 cl_mem data_in, cl_mem data_out, 
 
127
                                                 cl_int num_events, cl_event *event_list, cl_event *event )
 
128
{       
 
129
        int s;
 
130
        cl_fft_plan *plan = (cl_fft_plan *) Plan;
 
131
        if(plan->format != clFFT_InterleavedComplexFormat)
 
132
                return CL_INVALID_VALUE;
 
133
        
 
134
        cl_int err;
 
135
        size_t gWorkItems, lWorkItems;
 
136
        int inPlaceDone;
 
137
        
 
138
        cl_int isInPlace = data_in == data_out ? 1 : 0;
 
139
        
 
140
        if((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS)
 
141
                return err;     
 
142
        
 
143
        cl_mem memObj[3];
 
144
        memObj[0] = data_in;
 
145
        memObj[1] = data_out;
 
146
        memObj[2] = plan->tempmemobj;
 
147
        cl_fft_kernel_info *kernelInfo = plan->kernel_info;
 
148
        int numKernels = plan->num_kernels;
 
149
        
 
150
        int numKernelsOdd = numKernels & 1;
 
151
        int currRead  = 0;
 
152
        int currWrite = 1;
 
153
        
 
154
        // at least one external dram shuffle (transpose) required
 
155
        if(plan->temp_buffer_needed) 
 
156
        {
 
157
                // in-place transform
 
158
                if(isInPlace) 
 
159
                {
 
160
                        inPlaceDone = 0;
 
161
                        currRead  = 1;
 
162
                        currWrite = 2;
 
163
                }
 
164
                else
 
165
                {
 
166
                        currWrite = (numKernels & 1) ? 1 : 2;
 
167
                }
 
168
                
 
169
                while(kernelInfo) 
 
170
                {
 
171
                        if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) 
 
172
                        {
 
173
                                currWrite = currRead;
 
174
                                inPlaceDone = 1;
 
175
                        }
 
176
                        
 
177
                        s = batchSize;
 
178
                        getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
 
179
                        err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
 
180
                        err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
 
181
                        err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
 
182
                        err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
 
183
                        
 
184
                        err |= clEnqueueNDRangeKernel(queue,  kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
 
185
                        if(err)
 
186
                                return err;
 
187
                        
 
188
                        currRead  = (currWrite == 1) ? 1 : 2;
 
189
                        currWrite = (currWrite == 1) ? 2 : 1; 
 
190
                        
 
191
                        kernelInfo = kernelInfo->next;
 
192
                }                       
 
193
        }
 
194
        // no dram shuffle (transpose required) transform
 
195
        // all kernels can execute in-place.
 
196
        else {
 
197
                
 
198
                while(kernelInfo)
 
199
                {
 
200
                    s = batchSize;
 
201
                    getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
 
202
                    err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
 
203
                    err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
 
204
                    err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
 
205
                    err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
 
206
                
 
207
                    err |= clEnqueueNDRangeKernel(queue,  kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
 
208
                    if(err)
 
209
                            return err;         
 
210
                        
 
211
                        currRead  = 1;
 
212
                        currWrite = 1;
 
213
                        
 
214
                        kernelInfo = kernelInfo->next;
 
215
                }
 
216
        }
 
217
        
 
218
        return err;
 
219
}
 
220
 
 
221
cl_int 
 
222
clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, 
 
223
                                          cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
 
224
                                          cl_int num_events, cl_event *event_list, cl_event *event)
 
225
{       
 
226
        int s;
 
227
        cl_fft_plan *plan = (cl_fft_plan *) Plan;
 
228
        
 
229
        if(plan->format != clFFT_SplitComplexFormat)
 
230
                return CL_INVALID_VALUE;
 
231
        
 
232
        cl_int err;
 
233
        size_t gWorkItems, lWorkItems;
 
234
        int inPlaceDone;
 
235
        
 
236
        cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0;
 
237
        
 
238
        if((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS)
 
239
                return err;     
 
240
        
 
241
        cl_mem memObj_real[3];
 
242
        cl_mem memObj_imag[3];
 
243
        memObj_real[0] = data_in_real;
 
244
        memObj_real[1] = data_out_real;
 
245
        memObj_real[2] = plan->tempmemobj_real;
 
246
        memObj_imag[0] = data_in_imag;
 
247
        memObj_imag[1] = data_out_imag;
 
248
        memObj_imag[2] = plan->tempmemobj_imag;
 
249
                
 
250
        cl_fft_kernel_info *kernelInfo = plan->kernel_info;
 
251
        int numKernels = plan->num_kernels;
 
252
        
 
253
        int numKernelsOdd = numKernels & 1;
 
254
        int currRead  = 0;
 
255
        int currWrite = 1;
 
256
        
 
257
        // at least one external dram shuffle (transpose) required
 
258
        if(plan->temp_buffer_needed) 
 
259
        {
 
260
                // in-place transform
 
261
                if(isInPlace) 
 
262
                {
 
263
                        inPlaceDone = 0;
 
264
                        currRead  = 1;
 
265
                        currWrite = 2;
 
266
                }
 
267
                else
 
268
                {
 
269
                        currWrite = (numKernels & 1) ? 1 : 2;
 
270
                }
 
271
                
 
272
                while(kernelInfo) 
 
273
                {
 
274
                        if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) 
 
275
                        {
 
276
                                currWrite = currRead;
 
277
                                inPlaceDone = 1;
 
278
                        }
 
279
                        
 
280
                        s = batchSize;
 
281
                        getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
 
282
                        err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]);
 
283
                        err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]);
 
284
                        err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]);
 
285
                        err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
 
286
                        err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
 
287
                        err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
 
288
                        
 
289
                        err |= clEnqueueNDRangeKernel(queue,  kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
 
290
                        if(err)
 
291
                                return err;                     
 
292
                        
 
293
                        currRead  = (currWrite == 1) ? 1 : 2;
 
294
                        currWrite = (currWrite == 1) ? 2 : 1; 
 
295
                        
 
296
                        kernelInfo = kernelInfo->next;
 
297
                }                       
 
298
        }
 
299
        // no dram shuffle (transpose required) transform
 
300
        else {
 
301
                
 
302
                while(kernelInfo)
 
303
                {
 
304
                    s = batchSize;
 
305
                    getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
 
306
                    err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]);
 
307
                    err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]);
 
308
                    err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]);
 
309
                    err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
 
310
                    err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
 
311
                    err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
 
312
                
 
313
                    err |= clEnqueueNDRangeKernel(queue,  kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
 
314
                    if(err)
 
315
                            return err; 
 
316
                        
 
317
                        currRead  = 1;
 
318
                        currWrite = 1;
 
319
                
 
320
                        kernelInfo = kernelInfo->next;
 
321
                }
 
322
        }
 
323
        
 
324
        return err;
 
325
}
 
326
 
 
327
cl_int 
 
328
clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, 
 
329
                                                 size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir)
 
330
{
 
331
        cl_fft_plan *plan = (cl_fft_plan *) Plan;
 
332
        
 
333
        unsigned int N = numRows*numCols;
 
334
        unsigned int nCols = numCols;
 
335
        unsigned int sRow = startRow;
 
336
        unsigned int rToProcess = rowsToProcess;
 
337
        int d = dir;
 
338
        int err = 0;
 
339
        
 
340
        cl_device_id device_id;
 
341
        err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL);
 
342
        if(err)
 
343
            return err;
 
344
        
 
345
        size_t gSize;
 
346
        err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL);
 
347
        if(err)
 
348
            return err;
 
349
              
 
350
        gSize = min(128, gSize);
 
351
        size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize };
 
352
        size_t numLocalThreads[1]  = { gSize };
 
353
        
 
354
        err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array);
 
355
        err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(unsigned int), &sRow);
 
356
        err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &nCols);
 
357
        err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &N);
 
358
        err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &rToProcess);
 
359
        err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(int), &d);
 
360
        
 
361
        err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL);            
 
362
        
 
363
        return err;     
 
364
}
 
365
 
 
366
cl_int 
 
367
clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, 
 
368
                                         size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir)
 
369
{
 
370
        cl_fft_plan *plan = (cl_fft_plan *) Plan;
 
371
        
 
372
        unsigned int N = numRows*numCols;
 
373
        unsigned int nCols = numCols;
 
374
        unsigned int sRow = startRow;
 
375
        unsigned int rToProcess = rowsToProcess;
 
376
        int d = dir;
 
377
        int err = 0;
 
378
        
 
379
        cl_device_id device_id;
 
380
        err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL);
 
381
        if(err)
 
382
            return err;
 
383
        
 
384
        size_t gSize;
 
385
        err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL);
 
386
        if(err)
 
387
            return err;
 
388
              
 
389
        gSize = min(128, gSize);
 
390
        size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize };
 
391
        size_t numLocalThreads[1]  = { gSize };
 
392
        
 
393
        err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array_real);
 
394
        err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(cl_mem), &array_imag);
 
395
        err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &sRow);
 
396
        err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &nCols);
 
397
        err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &N);
 
398
        err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(unsigned int), &rToProcess);
 
399
        err |= clSetKernelArg(plan->twist_kernel, 6, sizeof(int), &d);
 
400
        
 
401
        err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL);            
 
402
        
 
403
        return err;     
 
404
}
 
405