bzr branch
http://darksoft.org/webbzr/opencl/oclfft
1
by Matthias Vogelgesang
Initial commit |
1 |
//
|
2 |
// File: fft_internal.h
|
|
3 |
//
|
|
4 |
// Version: <1.0>
|
|
5 |
//
|
|
6 |
// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple")
|
|
7 |
// in consideration of your agreement to the following terms, and your use,
|
|
8 |
// installation, modification or redistribution of this Apple software
|
|
9 |
// constitutes acceptance of these terms. If you do not agree with these
|
|
10 |
// terms, please do not use, install, modify or redistribute this Apple
|
|
11 |
// software.
|
|
12 |
//
|
|
13 |
// In consideration of your agreement to abide by the following terms, and
|
|
14 |
// subject to these terms, Apple grants you a personal, non - exclusive
|
|
15 |
// license, under Apple's copyrights in this original Apple software ( the
|
|
16 |
// "Apple Software" ), to use, reproduce, modify and redistribute the Apple
|
|
17 |
// Software, with or without modifications, in source and / or binary forms;
|
|
18 |
// provided that if you redistribute the Apple Software in its entirety and
|
|
19 |
// without modifications, you must retain this notice and the following text
|
|
20 |
// and disclaimers in all such redistributions of the Apple Software. Neither
|
|
21 |
// the name, trademarks, service marks or logos of Apple Inc. may be used to
|
|
22 |
// endorse or promote products derived from the Apple Software without specific
|
|
23 |
// prior written permission from Apple. Except as expressly stated in this
|
|
24 |
// notice, no other rights or licenses, express or implied, are granted by
|
|
25 |
// Apple herein, including but not limited to any patent rights that may be
|
|
26 |
// infringed by your derivative works or by other works in which the Apple
|
|
27 |
// Software may be incorporated.
|
|
28 |
//
|
|
29 |
// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
|
|
30 |
// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
|
|
31 |
// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
|
|
32 |
// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
|
|
33 |
// ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
|
|
34 |
//
|
|
35 |
// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
|
|
36 |
// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
37 |
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
38 |
// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
|
|
39 |
// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
|
|
40 |
// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
|
|
41 |
// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
42 |
//
|
|
43 |
// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
|
|
44 |
//
|
|
45 |
////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
46 |
||
47 |
||
48 |
#ifndef __CLFFT_INTERNAL_H
|
|
49 |
#define __CLFFT_INTERNAL_H
|
|
50 |
||
51 |
#include "clFFT.h" |
|
52 |
#include <sstream> |
|
53 |
||
54 |
using namespace std; |
|
55 |
||
56 |
// Direction tag attached to an FFT kernel: one of x, y or z.
// NOTE(review): presumably the signal axis the kernel transforms along --
// confirm against the code that fills in cl_fft_kernel_info::dir.
typedef enum kernel_dir_t
{
    cl_fft_kernel_x = 0,
    cl_fft_kernel_y = 1,
    cl_fft_kernel_z = 2
} cl_fft_kernel_dir;
|
62 |
||
63 |
typedef struct kernel_info_t |
|
64 |
{
|
|
65 |
cl_kernel kernel; |
|
66 |
char *kernel_name; |
|
67 |
size_t lmem_size; |
|
68 |
size_t num_workgroups; |
|
69 |
size_t num_xforms_per_workgroup; |
|
70 |
size_t num_workitems_per_workgroup; |
|
71 |
cl_fft_kernel_dir dir; |
|
72 |
int in_place_possible; |
|
73 |
kernel_info_t *next; |
|
74 |
}cl_fft_kernel_info; |
|
75 |
||
76 |
typedef struct |
|
77 |
{
|
|
78 |
// context in which fft resources are created and kernels are executed
|
|
79 |
cl_context context; |
|
80 |
||
81 |
// size of signal
|
|
82 |
clFFT_Dim3 n; |
|
83 |
||
84 |
// dimension of transform ... must be either 1D, 2D or 3D
|
|
85 |
clFFT_Dimension dim; |
|
86 |
||
87 |
// data format ... must be either interleaved or plannar
|
|
88 |
clFFT_DataFormat format; |
|
89 |
||
90 |
// string containing kernel source. Generated at runtime based on
|
|
91 |
// n, dim, format and other parameters
|
|
92 |
string *kernel_string; |
|
93 |
||
94 |
// CL program containing source and kernel this particular
|
|
95 |
// n, dim, data format
|
|
96 |
cl_program program; |
|
97 |
||
98 |
// linked list of kernels which needs to be executed for this fft
|
|
99 |
cl_fft_kernel_info *kernel_info; |
|
100 |
||
101 |
// number of kernels
|
|
102 |
int num_kernels; |
|
103 |
||
104 |
// twist kernel for virtualizing fft of very large sizes that do not
|
|
105 |
// fit in GPU global memory
|
|
106 |
cl_kernel twist_kernel; |
|
107 |
||
108 |
// flag indicating if temporary intermediate buffer is needed or not.
|
|
109 |
// this depends on fft kernels being executed and if transform is
|
|
110 |
// in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ...
|
|
111 |
// one that does not require global transpose do not need temporary buffer)
|
|
112 |
// 2D 1024x1024 out-of-place fft however do require intermediate buffer.
|
|
113 |
// If temp buffer is needed, its allocation is lazy i.e. its not allocated
|
|
114 |
// until its needed
|
|
115 |
cl_int temp_buffer_needed; |
|
116 |
||
117 |
// Batch size is runtime parameter and size of temporary buffer (if needed)
|
|
118 |
// depends on batch size. Allocation of temporary buffer is lazy i.e. its
|
|
119 |
// only created when needed. Once its created at first call of clFFT_Executexxx
|
|
120 |
// it is not allocated next time if next time clFFT_Executexxx is called with
|
|
121 |
// batch size different than the first call. last_batch_size caches the last
|
|
122 |
// batch size with which this plan is used so that we dont keep allocating/deallocating
|
|
123 |
// temp buffer if same batch size is used again and again.
|
|
124 |
size_t last_batch_size; |
|
125 |
||
126 |
// temporary buffer for interleaved plan
|
|
127 |
cl_mem tempmemobj; |
|
128 |
||
129 |
// temporary buffer for planner plan. Only one of tempmemobj or
|
|
130 |
// (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending
|
|
131 |
// data format of plan (plannar or interleaved)
|
|
132 |
cl_mem tempmemobj_real, tempmemobj_imag; |
|
133 |
||
134 |
// Maximum size of signal for which local memory transposed based
|
|
135 |
// fft is sufficient i.e. no global mem transpose (communication)
|
|
136 |
// is needed
|
|
137 |
size_t max_localmem_fft_size; |
|
138 |
||
139 |
// Maximum work items per work group allowed. This, along with max_radix below controls
|
|
140 |
// maximum local memory being used by fft kernels of this plan. Set to 256 by default
|
|
141 |
size_t max_work_item_per_workgroup; |
|
142 |
||
143 |
// Maximum base radix for local memory fft ... this controls the maximum register
|
|
144 |
// space used by work items. Currently defaults to 16
|
|
145 |
size_t max_radix; |
|
146 |
||
147 |
// Device depended parameter that tells how many work-items need to be read consecutive
|
|
148 |
// values to make sure global memory access by work-items of a work-group result in
|
|
149 |
// coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16
|
|
150 |
size_t min_mem_coalesce_width; |
|
151 |
||
152 |
// Number of local memory banks. This is used to geneate kernel with local memory
|
|
153 |
// transposes with appropriate padding to avoid bank conflicts to local memory
|
|
154 |
// e.g. on NVidia it is 16.
|
|
155 |
size_t num_local_mem_banks; |
|
156 |
}cl_fft_plan; |
|
157 |
||
158 |
// Declared here; the definition lives in another translation unit.
// NOTE(review): presumably generates the 1D FFT kernel(s) for `plan` along
// direction `dir` -- confirm against the definition.
void FFT1D(
    cl_fft_plan       *plan,
    cl_fft_kernel_dir  dir);
|
159 |
||
160 |
#endif
|
|
161 |