/tomo/pyhst

To get this branch, use:
bzr branch http://darksoft.org/webbzr/tomo/pyhst

« back to all changes in this revision

Viewing changes to docs/optimizations/kepler/approximation.txt

  • Committer: Suren A. Chilingaryan
  • Date: 2012-05-10 15:06:33 UTC
  • Revision ID: csa@dside.dyndns.org-20120510150633-56gdy6t3tflz2gab
OpenCL clean-up

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
Normal:
 
2
                        // Reference form: split subh1 / subh2 into an integer index and the
                        // remainder left after subtracting the floored value.
                        float fsubh1 = floorf(subh1);   // floor of subh1, still held as a float
 
3
                        int idx1 = (int)fsubh1;         // same floored value converted to int
 
4
                        fsubh1 = subh1 - fsubh1;        // fsubh1 is reused: now subh1 - floorf(subh1)
 
5
                        
 
6
                        float fsubh2 = floorf(subh2);   // identical sequence for the second value
 
7
                        int idx2 = (int)fsubh2;         // floored subh2 converted to int
 
8
                        fsubh2 = subh2 - fsubh2;        // fsubh2 is reused: now subh2 - floorf(subh2)
 
9
 
 
10
Tricky:
 
11
                        // Magic-number variant: replaces floorf() and the float->int conversion
                        // with one float add plus an integer subtract on the raw bit pattern.
                        // Assumes exp23 == 2^23 (definition not shown here) -- TODO confirm.
                        // NOTE(review): the add rounds in the active FP rounding mode, so under
                        // round-to-nearest idx1/idx2 would be the nearest integer rather than
                        // floorf(), and the remainder could be negative -- confirm this is the
                        // intended approximation and that the inputs stay in the valid range.
                        float fsubh1 = subh1 + exp23;                     // sum whose low mantissa bits are expected to carry the integer part
 
12
                        int idx1 = (*(int*)(&fsubh1)) - 0x4B000000;       // reinterpret the float bits as int; 0x4B000000 is the IEEE-754 single encoding of 2^23
 
13
                        fsubh1 = subh1 - (fsubh1 - exp23);                // subh1 minus the integer-valued float recovered from the sum
 
14
 
 
15
                        float fsubh2 = subh2 + exp23;                     // identical sequence for the second value
 
16
                        int idx2 = (*(int*)(&fsubh2)) - 0x4B000000;       // raw bits minus the 2^23 bit pattern
 
17
                        fsubh2 = subh2 - (fsubh2 - exp23);                // subh2 minus the recovered integer-valued float
 
18
 
 
19
 
 
20
Assembler (simple):
 
21
                        // Inline PTX version of the split for subh1.
                        // Operands: %0 = fsubh1, %1 = idx1, %2 = isubh1 (outputs);
                        //           %3 = subh1, %4 = exp23, %5 = exp4b, %6 = eight (inputs).
                        // Presumably exp23 == 2^23 and exp4b == 0x4B000000 as in the "Tricky"
                        // listing, and eight == 8.0f -- definitions are not shown; confirm.
                        // NOTE(review): this listing is byte-identical to the "Assembler (fma)"
                        // listing and already contains the fma step -- confirm whether the
                        // "simple" variant was meant to differ.
                        asm(
 
22
                            "add.f32 %0, %3, %4;                \n\t"   // fsubh1 = subh1 + exp23
 
23
                            "sub.f32 %2, %0, %4;                \n\t"   // isubh1 = fsubh1 - exp23
 
24
                            "fma.f32.rm  %0, %2, %6, %4;        \n\t"   // fsubh1 = isubh1 * eight + exp23, .rm = round toward -inf
 
25
                            "mov.b32 %1, %0;                    \n\t"   // idx1 = raw 32-bit pattern of fsubh1
 
26
                            "sub.u32 %1, %1, %5;                \n\t"   // idx1 -= exp4b
 
27
                            "sub.f32 %0, %3, %2;                \n\t"   // fsubh1 = subh1 - isubh1
 
28
                        : "=f" (fsubh1), "=r" (idx1), "=f" (isubh1)
 
29
                        : "f" (subh1), "f" (exp23), "r" (exp4b), "f" (eight)
 
30
                        );
 
31
 
 
32
 
 
33
                        // Same instruction sequence applied to subh2
                        // (outputs fsubh2, idx2, isubh2; same constant inputs).
                        asm(
 
34
                            "add.f32 %0, %3, %4;                \n\t"   // fsubh2 = subh2 + exp23
 
35
                            "sub.f32 %2, %0, %4;                \n\t"   // isubh2 = fsubh2 - exp23
 
36
                            "fma.f32.rm  %0, %2, %6, %4;        \n\t"   // fsubh2 = isubh2 * eight + exp23, .rm = round toward -inf
 
37
                            "mov.b32 %1, %0;                    \n\t"   // idx2 = raw 32-bit pattern of fsubh2
 
38
                            "sub.u32 %1, %1, %5;                \n\t"   // idx2 -= exp4b
 
39
                            "sub.f32 %0, %3, %2;                \n\t"   // fsubh2 = subh2 - isubh2
 
40
                        : "=f" (fsubh2), "=r" (idx2), "=f" (isubh2)
 
41
                        : "f" (subh2), "f" (exp23), "r" (exp4b), "f" (eight)
 
42
                        );
 
43
 
 
44
Assembler (fma):
 
45
                        // Inline PTX split for subh1 that uses an fma to rebuild the biased
                        // value as isubh1 * eight + exp23 before extracting the index.
                        // Operands: %0 = fsubh1, %1 = idx1, %2 = isubh1 (outputs);
                        //           %3 = subh1, %4 = exp23, %5 = exp4b, %6 = eight (inputs).
                        // Presumably exp23 == 2^23 and exp4b == 0x4B000000 as in the "Tricky"
                        // listing, and eight == 8.0f -- definitions are not shown; confirm.
                        // NOTE(review): this listing is byte-identical to the
                        // "Assembler (simple)" listing -- confirm which one is authoritative.
                        asm(
 
46
                            "add.f32 %0, %3, %4;                \n\t"   // fsubh1 = subh1 + exp23
 
47
                            "sub.f32 %2, %0, %4;                \n\t"   // isubh1 = fsubh1 - exp23
 
48
                            "fma.f32.rm  %0, %2, %6, %4;        \n\t"   // fsubh1 = isubh1 * eight + exp23, .rm = round toward -inf
 
49
                            "mov.b32 %1, %0;                    \n\t"   // idx1 = raw 32-bit pattern of fsubh1
 
50
                            "sub.u32 %1, %1, %5;                \n\t"   // idx1 -= exp4b
 
51
                            "sub.f32 %0, %3, %2;                \n\t"   // fsubh1 = subh1 - isubh1
 
52
                        : "=f" (fsubh1), "=r" (idx1), "=f" (isubh1)
 
53
                        : "f" (subh1), "f" (exp23), "r" (exp4b), "f" (eight)
 
54
                        );
 
55
 
 
56
 
 
57
                        // Same instruction sequence applied to subh2
                        // (outputs fsubh2, idx2, isubh2; same constant inputs).
                        asm(
 
58
                            "add.f32 %0, %3, %4;                \n\t"   // fsubh2 = subh2 + exp23
 
59
                            "sub.f32 %2, %0, %4;                \n\t"   // isubh2 = fsubh2 - exp23
 
60
                            "fma.f32.rm  %0, %2, %6, %4;        \n\t"   // fsubh2 = isubh2 * eight + exp23, .rm = round toward -inf
 
61
                            "mov.b32 %1, %0;                    \n\t"   // idx2 = raw 32-bit pattern of fsubh2
 
62
                            "sub.u32 %1, %1, %5;                \n\t"   // idx2 -= exp4b
 
63
                            "sub.f32 %0, %3, %2;                \n\t"   // fsubh2 = subh2 - isubh2
 
64
                        : "=f" (fsubh2), "=r" (idx2), "=f" (isubh2)
 
65
                        : "f" (subh2), "f" (exp23), "r" (exp4b), "f" (eight)
 
66
                        );
 
67
 
 
68
Assembler (joint):
 
69
                        asm(
 
70
                            "add.f32 %0, %6, %7;                \n\t"
 
71
                            "add.f32 %3, %10, %11;              \n\t"
 
72
                            "sub.f32 %2, %0, %7;                \n\t"
 
73
                            "sub.f32 %5, %3, %11;               \n\t"
 
74
                            "fma.f32.rm  %0, %2, %9, %7;        \n\t"
 
75
                            "fma.f32.rm  %3, %5, %13, %11;      \n\t"
 
76
                            "mov.b32 %1, %0;                    \n\t"
 
77
                            "mov.b32 %4, %3;                    \n\t"
 
78
                            "sub.u32 %1, %1, %8;                \n\t"
 
79
                            "sub.u32 %4, %4, %12;               \n\t"
 
80
                            "sub.f32 %0, %6, %2;                \n\t"
 
81
                            "sub.f32 %3, %10, %9;               \n\t"
 
82
                        : "=f" (fsubh1), "=r" (idx1), "=f" (isubh1),
 
83
                        "=f" (fsubh2), "=r" (idx2), "=f" (isubh2)
 
84
                        : "f" (subh1), "f" (exp23), "r" (exp4b), "f" (eight),
 
85
                        "f" (subh2), "f" (exp23), "r" (exp4b), "f" (eight)
 
86
                        );