/opencl/oclfft

To get this branch, use:
bzr branch http://darksoft.org/webbzr/opencl/oclfft
1 by Matthias Vogelgesang
Initial commit
1
//
2
// File:       fft_internal.h
3
//
4
// Version:    <1.0>
5
//
6
// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
7
//             in consideration of your agreement to the following terms, and your use,
8
//             installation, modification or redistribution of this Apple software
9
//             constitutes acceptance of these terms.  If you do not agree with these
10
//             terms, please do not use, install, modify or redistribute this Apple
11
//             software.
12
//
13
//             In consideration of your agreement to abide by the following terms, and
14
//             subject to these terms, Apple grants you a personal, non - exclusive
15
//             license, under Apple's copyrights in this original Apple software ( the
16
//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
17
//             Software, with or without modifications, in source and / or binary forms;
18
//             provided that if you redistribute the Apple Software in its entirety and
19
//             without modifications, you must retain this notice and the following text
20
//             and disclaimers in all such redistributions of the Apple Software. Neither
21
//             the name, trademarks, service marks or logos of Apple Inc. may be used to
22
//             endorse or promote products derived from the Apple Software without specific
23
//             prior written permission from Apple.  Except as expressly stated in this
24
//             notice, no other rights or licenses, express or implied, are granted by
25
//             Apple herein, including but not limited to any patent rights that may be
26
//             infringed by your derivative works or by other works in which the Apple
27
//             Software may be incorporated.
28
//
29
//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
30
//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
31
//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
32
//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
33
//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
34
//
35
//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
36
//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
37
//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
38
//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
39
//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
40
//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
41
//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42
//
43
// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
44
//
45
////////////////////////////////////////////////////////////////////////////////////////////////////
46
47
48
#ifndef __CLFFT_INTERNAL_H
49
#define __CLFFT_INTERNAL_H
50
51
#include "clFFT.h"
52
#include <sstream>
53
54
// NOTE(review): a using-directive at global scope in a header makes every name
// in std visible in each file that includes this header. It is kept as is
// because `string` is used unqualified further down in this header and
// including files may depend on the directive as well.
using namespace std;
55
56
// Selects the x, y or z direction for an FFT kernel (this is the type of the
// `dir` field of cl_fft_kernel_info and of the `dir` argument of FFT1D).
// The enumerators carry the default values 0, 1 and 2 in declaration order.
typedef enum kernel_dir_t
{
	cl_fft_kernel_x,
	cl_fft_kernel_y,
	cl_fft_kernel_z
} cl_fft_kernel_dir;
62
63
typedef struct kernel_info_t
64
{
65
	cl_kernel kernel;
66
	char *kernel_name;
67
	size_t lmem_size;
68
	size_t num_workgroups;
69
    size_t num_xforms_per_workgroup;
70
	size_t num_workitems_per_workgroup;
71
	cl_fft_kernel_dir dir;
72
	int in_place_possible;
73
	kernel_info_t *next;
74
}cl_fft_kernel_info;
75
76
typedef struct 
77
{
78
	// context in which fft resources are created and kernels are executed
79
	cl_context              context;
80
	
81
	// size of signal
82
	clFFT_Dim3              n;
83
	
84
	// dimension of transform ... must be either 1D, 2D or 3D
85
	clFFT_Dimension			dim;
86
	
87
	// data format ... must be either interleaved or plannar
88
	clFFT_DataFormat		format;
89
	
90
	// string containing kernel source. Generated at runtime based on
91
	// n, dim, format and other parameters
92
	string                  *kernel_string;
93
	
94
	// CL program containing source and kernel this particular 
95
	// n, dim, data format
96
	cl_program				program;
97
	
98
	// linked list of kernels which needs to be executed for this fft
99
	cl_fft_kernel_info		*kernel_info;
100
	
101
	// number of kernels
102
	int                     num_kernels;
103
	
104
	// twist kernel for virtualizing fft of very large sizes that do not
105
	// fit in GPU global memory
106
	cl_kernel				twist_kernel;
107
	
108
	// flag indicating if temporary intermediate buffer is needed or not.
109
	// this depends on fft kernels being executed and if transform is 
110
	// in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ... 
111
	// one that does not require global transpose do not need temporary buffer)
112
	// 2D 1024x1024 out-of-place fft however do require intermediate buffer.
113
	// If temp buffer is needed, its allocation is lazy i.e. its not allocated
114
	// until its needed
115
	cl_int                  temp_buffer_needed;
116
	
117
	// Batch size is runtime parameter and size of temporary buffer (if needed)
118
	// depends on batch size. Allocation of temporary buffer is lazy i.e. its
119
	// only created when needed. Once its created at first call of clFFT_Executexxx
120
	// it is not allocated next time if next time clFFT_Executexxx is called with 
121
	// batch size different than the first call. last_batch_size caches the last
122
	// batch size with which this plan is used so that we dont keep allocating/deallocating
123
	// temp buffer if same batch size is used again and again.
124
	size_t                  last_batch_size;
125
	
126
	// temporary buffer for interleaved plan
127
	cl_mem   				tempmemobj;
128
	
129
	// temporary buffer for planner plan. Only one of tempmemobj or 
130
	// (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending 
131
	// data format of plan (plannar or interleaved)
132
	cl_mem                  tempmemobj_real, tempmemobj_imag;
133
	
134
	// Maximum size of signal for which local memory transposed based
135
	// fft is sufficient i.e. no global mem transpose (communication)
136
	// is needed
137
	size_t					max_localmem_fft_size;
138
	
139
	// Maximum work items per work group allowed. This, along with max_radix below controls 
140
	// maximum local memory being used by fft kernels of this plan. Set to 256 by default
141
	size_t                  max_work_item_per_workgroup;
142
	
143
	// Maximum base radix for local memory fft ... this controls the maximum register 
144
	// space used by work items. Currently defaults to 16
145
	size_t                  max_radix;
146
	
147
	// Device depended parameter that tells how many work-items need to be read consecutive
148
	// values to make sure global memory access by work-items of a work-group result in 
149
	// coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16
150
	size_t                  min_mem_coalesce_width;
151
	
152
	// Number of local memory banks. This is used to geneate kernel with local memory 
153
	// transposes with appropriate padding to avoid bank conflicts to local memory
154
	// e.g. on NVidia it is 16.
155
	size_t                  num_local_mem_banks;
156
}cl_fft_plan;
157
158
// Only the prototype is visible in this header; the definition lives elsewhere.
// Takes the plan to operate on and a kernel direction (x, y or z).
// NOTE(review): presumably generates the 1D FFT kernel(s) for `plan` along
// `dir` -- confirm against the definition.
void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir);
159
160
#endif  
161