ae2f_docs
Slp.h
1#ifndef Slp_h
2#define Slp_h
3
4#define ae2f_NEED_CLASS 0
5
6#include "mac.h"
7#include <ae2f/Ann/Slp.core.h>
8#include <ae2f/Ann/Slp.auto.h>
9
10
/*
 * Forwarding wrappers that discard their leading argument(s):
 *   _clAtomAddF_tmpl(a, b, c, d, e) forwards only (d, e) to clAtomAddF_tmpl.
 *   _clAtomAddF(a, b, c, d)         forwards (b, c, d) to clAtomAddF.
 * In this file _clAtomAddF is invoked with an address-space qualifier as its
 * first argument, which is the argument dropped here.
 */
12#define _clAtomAddF_tmpl(a, b, c, d, e) clAtomAddF_tmpl(d, e)
13#define _clAtomAddF(a, b, c, d) clAtomAddF(b, c, d)
14#endif
15
/*
 * _clAtomAddF_t(__global, host_float_t)
 *
 * Expands to an anonymous struct type used as scratch state by clAtomAddF.
 * The first parameter is placed in front of the pointer members as their
 * qualifier; the second is the floating-point element type.
 *
 * Members:
 *   m_atom[2] - two slots; each slot is a union that views the same bytes as
 *               one element (m_f), as an array of elements (m_fa, with
 *               MAX(1, 4 / sizeof(element)) entries), or as an array of uint
 *               (m_u, with MAX(1, sizeof(element) >> 2) entries).
 *   m_count   - counter; clAtomAddF uses it as its loop index.
 *   m_U0      - a single pointer viewed as a volatile uint pointer (m_pchg),
 *               a volatile element pointer (m_fp), or an intptr_t (m_ip).
 */
16#define _clAtomAddF_t(__global, host_float_t) struct {
17 union {
18 host_float_t m_f;
19 host_float_t m_fa[MAX(1, 4 / sizeof(host_float_t))];
20 uint m_u[(MAX(1, sizeof(host_float_t) >> 2))];
21 } m_atom[2];
22 uint m_count;
23 union {
24 __global volatile uint* m_pchg;
25 __global volatile host_float_t* m_fp;
26 intptr_t m_ip;
27 } m_U0; \
28}
29
/* Scratch type for clAtomAddF: _clAtomAddF_t instantiated with __global and host_float_t. */
31typedef _clAtomAddF_t(__global, host_float_t) clAtomAddF_t;
32#endif
33
/*
 * clAtomAddF - add prm_val to the element at prm_dst, using v_mem as scratch.
 *
 * Visible steps:
 *   1. Snapshot the destination into v_mem.m_atom.
 *   2. Add prm_val to v_mem.m_atom[1].m_f.
 *   3. Write slot 1 back to the destination one uint word at a time through
 *      atom_xchg_u.
 *
 * Parameters:
 *   v_mem   - scratch of type clAtomAddF_t; m_atom, m_count and m_U0 are
 *             overwritten.
 *   prm_dst - destination element(s); read in step 1, written in step 3.
 *   prm_val - value to add.
 *
 * NOTE(review): the read (step 1) and the exchange (step 3) are separate
 * operations and any result of atom_xchg_u is ignored here, so an update made
 * to prm_dst by another work-item between the two would presumably be
 * overwritten - confirm whether callers rely on a true atomic add.
 */
34ae2f_MAC(__global, ) clAtomAddF(clAtomAddF_t v_mem, __global volatile host_float_t* prm_dst, ae2f_float_t prm_val)
35{
 /* Element narrower than 4 bytes: copy 4 / sizeof(element) elements,
  * highest index first, into slot 0. */
 /* NOTE(review): this branch fills only m_atom[0]; m_atom[1], which is
  * incremented and written back below, is not assigned here - confirm. */
36 if(sizeof((v_mem).m_atom[0].m_f) < 4) {
37 (v_mem).m_count = 4 / sizeof((v_mem).m_atom[0].m_f);
38 while((v_mem).m_count--) {
39 (v_mem).m_atom[0].m_fa[(v_mem).m_count]
40 = (prm_dst)[(v_mem).m_count]
41 ;
42 }
43 } else {
 /* Element of 4 bytes or more: load the current value into both slots. */
44 ((v_mem).m_atom)[0].m_f = ((v_mem).m_atom)[1].m_f = *(prm_dst);
45 }
46
 /* Apply the increment to slot 1. */
47 ((v_mem).m_atom)[1].m_f += (prm_val);
48
 /* Number of uint words to write back: sizeof(element) / 4, at least one. */
49 (v_mem).m_count = MAX(1, (sizeof(((v_mem).m_atom[0].m_f)) >> 2));
50 while((v_mem).m_count--) {
 /* Through the m_U0 union: set the pointer as an element pointer to
  * prm_dst, then advance it as a uint pointer by m_count words. */
51 (v_mem).m_U0.m_fp = (prm_dst);
52 (v_mem).m_U0.m_pchg += (v_mem).m_count;
 /* Exchange the matching word of slot 1 into the destination. */
53 atom_xchg_u(
54 (v_mem).m_U0.m_pchg
55 , (v_mem).m_atom[1].m_u[(v_mem).m_count]
56 );
57 }
58}
59
/*
 * clSlpPredict uses the same scratch layout as clAtomAddF: both the
 * type-constructor macro and the concrete typedef are plain aliases.
 */
61#define _clSlpPredict_t _clAtomAddF_t
62typedef clAtomAddF_t clSlpPredict_t;
63#endif
64
/*
 * clSlpPredict - accumulate one weight * input product into loc[oidx] and,
 * when iidx is 0, add the bias and call the activation.
 *
 * Parameters:
 *   v_mem        - scratch of type clSlpPredict_t (alias of clAtomAddF_t).
 *   ret          - passed by address to ACT as its first argument.
 *   loc          - __local accumulator array, indexed by oidx.
 *   p_inp        - inputs, indexed by iidx.
 *   p_weight     - weights, indexed as oidx * isz + iidx.
 *   p_bias       - biases, indexed by oidx.
 *   iidx, isz    - input index and input count.
 *   oidx, osz    - output index and output count.
 *   ACT          - activation, called as ACT(&ret, loc, oidx, osz).
 *
 * Nothing is done unless oidx < osz and iidx < isz.
 *
 * NOTE(review): no barrier is visible between zeroing loc[oidx], the
 * accumulation, and the bias/activation step. If separate work-items handle
 * separate iidx values, the required ordering is presumably enforced by the
 * caller - confirm.
 */
65ae2f_MAC(__global, ) clSlpPredict(
66 clSlpPredict_t v_mem,
67 ae2f_float_t ret,
68 __local ae2f_float_t* const loc,
69 const __global ae2f_float_t* const p_inp,
70 const __global ae2f_float_t* const p_weight,
71 const __global ae2f_float_t* const p_bias,
72 const size_t iidx, const size_t isz,
73 const size_t oidx, const size_t osz,
74 ae2f_AnnActFFN_t ACT
75 )
76{
77 if((oidx) < (osz) && (iidx) < (isz)) {
 /* iidx == 0 resets the accumulator for this output. */
78 unless((iidx)) (loc)[oidx] = 0;
79
 /* Elements of 4 bytes or more: add the product through _clAtomAddF. */
80 if(sizeof((v_mem).m_atom[0].m_f) >= 4) {
81 _clAtomAddF(__global
82 , v_mem, &(loc)[oidx]
83 , (p_weight)[(oidx) * (isz) + (iidx)] * (p_inp)[iidx]
84 );
85 } else {
 /* Narrower elements: only even oidx acts, covering outputs oidx and
  * oidx + 1 through the two m_fa entries that share one uint word. */
 /* NOTE(review): the word read from loc is stored in m_atom[0], but the
  * sums are built on and written from m_atom[1], which is not seeded
  * from m_atom[0] here. The word address is always the start of loc
  * regardless of oidx, and loc (__local) is cast to a __global
  * pointer. Confirm each of these is intended. */
86 unless((oidx) & 1) {
87 (v_mem).m_atom[0].m_u[0] =
88 CAST(__global uint*, loc)[0];
89
90 (v_mem).m_atom[1].m_fa[0]
91 = (v_mem).m_atom[1].m_fa[0]
92 + (p_weight)[(oidx) * (isz) + (iidx)] * (p_inp)[iidx]
93 ;
94
 /* The second half of the word is updated only if output oidx + 1 exists. */
95 if((oidx) + 1 < (osz))
96 (v_mem).m_atom[1].m_fa[1]
97 = (v_mem).m_atom[1].m_fa[1]
98 + (p_weight)[((oidx) + 1) * (isz) + (iidx)] * (p_inp)[iidx]
99 ;
100
101 atom_xchg_u(CAST(__global uint*, loc), (v_mem).m_atom[1].m_u[0]);
102 }
103 }
104
 /* iidx == 0 also adds the bias and invokes the activation. */
105 unless(iidx) {
106 (loc)[oidx] += (p_bias)[oidx];
107 ACT(&(ret), (loc), oidx, osz);
108 }
109 }
110
111}
113
114#endif
#define unless(...)
Executes the guarded statement only when the condition is false.
Definition Cast.h:103
#define __ae2f_MACRO_GENERATED
Definition Conv.auto.h:2
#define __global
Definition addrspec.h:8
#define __local
Definition addrspec.h:10
#define size_t
Definition mac.h:20
#define CAST(t, x)
Definition mac.h:16
#define host_float_t
Definition mac.h:9
#define ae2f_MAC(...)
Definition mac.h:28
#define MAX(a, b)
Definition mac.h:18
#define uint
Definition sclr.h:11
#define _clAtomAddF(__global, v_mem, prm_dst, prm_val)
Definition Slp.auto.h:42
#define _clAtomAddF_t(__global, host_float_t)
Definition Slp.auto.h:24