forked from RPCS3/rpcs3
-
Notifications
You must be signed in to change notification settings - Fork 4
/
GNU.h
394 lines (320 loc) · 11.2 KB
/
GNU.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
#pragma once
#include <emmintrin.h>
// temporarily (until noexcept is available); use `noexcept(true)` instead of `noexcept` if necessary
#if defined(_MSC_VER) && _MSC_VER <= 1800
#define noexcept _NOEXCEPT_OP
#endif
#if defined(_MSC_VER) && _MSC_VER <= 1800
#define thread_local __declspec(thread)
#elif __APPLE__
#define thread_local __thread
#endif
#if defined(_MSC_VER)
#define never_inline __declspec(noinline)
#else
#define never_inline __attribute__((noinline))
#endif
#if defined(_MSC_VER)
#define safe_buffers __declspec(safebuffers)
#else
#define safe_buffers
#endif
#if defined(_MSC_VER)
#define force_inline __forceinline
#else
#define force_inline __attribute__((always_inline))
#endif
#if defined(_MSC_VER)
#define set_alignment(x) _CRT_ALIGN(x)
#else
#define set_alignment(x) __attribute__((aligned(x)))
#endif
#if defined(__GNUG__)
#include <stdlib.h>
#include <cstdint>
#ifndef __APPLE__
#include <malloc.h>
#endif
#define _fpclass(x) std::fpclassify(x)
#define _byteswap_ushort(x) __builtin_bswap16(x)
#define _byteswap_ulong(x) __builtin_bswap32(x)
#define _byteswap_uint64(x) __builtin_bswap64(x)
#define INFINITE 0xFFFFFFFF
inline uint64_t __umulh(uint64_t a, uint64_t b)
{
uint64_t result;
__asm__("mulq %[b]" : "=d" (result) : [a] "a" (a), [b] "rm" (b));
return result;
}
inline int64_t __mulh(int64_t a, int64_t b)
{
int64_t result;
__asm__("imulq %[b]" : "=d" (result) : [a] "a" (a), [b] "rm" (b));
return result;
}
#ifdef __APPLE__
int clock_gettime(int foo, struct timespec *ts);
#define wxIsNaN(x) ((x) != (x))
#ifndef CLOCK_MONOTONIC
#define CLOCK_MONOTONIC 0
#endif /* !CLOCK_MONOTONIC */
#endif /* __APPLE__ */
template<typename T, typename T2> inline std::enable_if_t<std::is_arithmetic<T>::value, T> sync_val_compare_and_swap(volatile T* dest, T2 comp, T2 exch)
{
return __sync_val_compare_and_swap(dest, comp, exch);
}
template<typename T, typename T2> inline std::enable_if_t<std::is_arithmetic<T>::value, bool> sync_bool_compare_and_swap(volatile T* dest, T2 comp, T2 exch)
{
return __sync_bool_compare_and_swap(dest, comp, exch);
}
template<typename T, typename T2> inline std::enable_if_t<std::is_arithmetic<T>::value, T> sync_lock_test_and_set(volatile T* dest, T2 value)
{
return __sync_lock_test_and_set(dest, value);
}
template<typename T, typename T2> inline std::enable_if_t<std::is_arithmetic<T>::value, T> sync_fetch_and_add(volatile T* dest, T2 value)
{
return __sync_fetch_and_add(dest, value);
}
template<typename T, typename T2> inline std::enable_if_t<std::is_arithmetic<T>::value, T> sync_fetch_and_sub(volatile T* dest, T2 value)
{
return __sync_fetch_and_sub(dest, value);
}
template<typename T, typename T2> inline std::enable_if_t<std::is_arithmetic<T>::value, T> sync_fetch_and_or(volatile T* dest, T2 value)
{
return __sync_fetch_and_or(dest, value);
}
template<typename T, typename T2> inline std::enable_if_t<std::is_arithmetic<T>::value, T> sync_fetch_and_and(volatile T* dest, T2 value)
{
return __sync_fetch_and_and(dest, value);
}
template<typename T, typename T2> inline std::enable_if_t<std::is_arithmetic<T>::value, T> sync_fetch_and_xor(volatile T* dest, T2 value)
{
return __sync_fetch_and_xor(dest, value);
}
#endif /* __GNUG__ */
#if defined(_MSC_VER)
// atomic compare and swap functions
inline uint8_t sync_val_compare_and_swap(volatile uint8_t* dest, uint8_t comp, uint8_t exch)
{
return _InterlockedCompareExchange8((volatile char*)dest, exch, comp);
}
inline uint16_t sync_val_compare_and_swap(volatile uint16_t* dest, uint16_t comp, uint16_t exch)
{
return _InterlockedCompareExchange16((volatile short*)dest, exch, comp);
}
inline uint32_t sync_val_compare_and_swap(volatile uint32_t* dest, uint32_t comp, uint32_t exch)
{
return _InterlockedCompareExchange((volatile long*)dest, exch, comp);
}
inline uint64_t sync_val_compare_and_swap(volatile uint64_t* dest, uint64_t comp, uint64_t exch)
{
return _InterlockedCompareExchange64((volatile long long*)dest, exch, comp);
}
inline bool sync_bool_compare_and_swap(volatile uint8_t* dest, uint8_t comp, uint8_t exch)
{
return (uint8_t)_InterlockedCompareExchange8((volatile char*)dest, exch, comp) == comp;
}
inline bool sync_bool_compare_and_swap(volatile uint16_t* dest, uint16_t comp, uint16_t exch)
{
return (uint16_t)_InterlockedCompareExchange16((volatile short*)dest, exch, comp) == comp;
}
inline bool sync_bool_compare_and_swap(volatile uint32_t* dest, uint32_t comp, uint32_t exch)
{
return (uint32_t)_InterlockedCompareExchange((volatile long*)dest, exch, comp) == comp;
}
inline bool sync_bool_compare_and_swap(volatile uint64_t* dest, uint64_t comp, uint64_t exch)
{
return (uint64_t)_InterlockedCompareExchange64((volatile long long*)dest, exch, comp) == comp;
}
// atomic exchange functions
inline uint8_t sync_lock_test_and_set(volatile uint8_t* dest, uint8_t value)
{
return _InterlockedExchange8((volatile char*)dest, value);
}
inline uint16_t sync_lock_test_and_set(volatile uint16_t* dest, uint16_t value)
{
return _InterlockedExchange16((volatile short*)dest, value);
}
inline uint32_t sync_lock_test_and_set(volatile uint32_t* dest, uint32_t value)
{
return _InterlockedExchange((volatile long*)dest, value);
}
inline uint64_t sync_lock_test_and_set(volatile uint64_t* dest, uint64_t value)
{
return _InterlockedExchange64((volatile long long*)dest, value);
}
// atomic add functions
inline uint8_t sync_fetch_and_add(volatile uint8_t* dest, uint8_t value)
{
return _InterlockedExchangeAdd8((volatile char*)dest, value);
}
inline uint16_t sync_fetch_and_add(volatile uint16_t* dest, uint16_t value)
{
return _InterlockedExchangeAdd16((volatile short*)dest, value);
}
inline uint32_t sync_fetch_and_add(volatile uint32_t* dest, uint32_t value)
{
return _InterlockedExchangeAdd((volatile long*)dest, value);
}
inline uint64_t sync_fetch_and_add(volatile uint64_t* dest, uint64_t value)
{
return _InterlockedExchangeAdd64((volatile long long*)dest, value);
}
// atomic sub functions
inline uint8_t sync_fetch_and_sub(volatile uint8_t* dest, uint8_t value)
{
return _InterlockedExchangeAdd8((volatile char*)dest, -(char)value);
}
inline uint16_t sync_fetch_and_sub(volatile uint16_t* dest, uint16_t value)
{
return _InterlockedExchangeAdd16((volatile short*)dest, -(short)value);
}
inline uint32_t sync_fetch_and_sub(volatile uint32_t* dest, uint32_t value)
{
return _InterlockedExchangeAdd((volatile long*)dest, -(long)value);
}
inline uint64_t sync_fetch_and_sub(volatile uint64_t* dest, uint64_t value)
{
return _InterlockedExchangeAdd64((volatile long long*)dest, -(long long)value);
}
// atomic `bitwise or` functions
inline uint8_t sync_fetch_and_or(volatile uint8_t* dest, uint8_t value)
{
return _InterlockedOr8((volatile char*)dest, value);
}
inline uint16_t sync_fetch_and_or(volatile uint16_t* dest, uint16_t value)
{
return _InterlockedOr16((volatile short*)dest, value);
}
inline uint32_t sync_fetch_and_or(volatile uint32_t* dest, uint32_t value)
{
return _InterlockedOr((volatile long*)dest, value);
}
inline uint64_t sync_fetch_and_or(volatile uint64_t* dest, uint64_t value)
{
return _InterlockedOr64((volatile long long*)dest, value);
}
// atomic `bitwise and` functions
inline uint8_t sync_fetch_and_and(volatile uint8_t* dest, uint8_t value)
{
return _InterlockedAnd8((volatile char*)dest, value);
}
inline uint16_t sync_fetch_and_and(volatile uint16_t* dest, uint16_t value)
{
return _InterlockedAnd16((volatile short*)dest, value);
}
inline uint32_t sync_fetch_and_and(volatile uint32_t* dest, uint32_t value)
{
return _InterlockedAnd((volatile long*)dest, value);
}
inline uint64_t sync_fetch_and_and(volatile uint64_t* dest, uint64_t value)
{
return _InterlockedAnd64((volatile long long*)dest, value);
}
// atomic `bitwise xor` functions
inline uint8_t sync_fetch_and_xor(volatile uint8_t* dest, uint8_t value)
{
return _InterlockedXor8((volatile char*)dest, value);
}
inline uint16_t sync_fetch_and_xor(volatile uint16_t* dest, uint16_t value)
{
return _InterlockedXor16((volatile short*)dest, value);
}
inline uint32_t sync_fetch_and_xor(volatile uint32_t* dest, uint32_t value)
{
return _InterlockedXor((volatile long*)dest, value);
}
inline uint64_t sync_fetch_and_xor(volatile uint64_t* dest, uint64_t value)
{
return _InterlockedXor64((volatile long long*)dest, value);
}
#endif /* _MSC_VER */
inline uint32_t cntlz32(uint32_t arg)
{
#if defined(_MSC_VER)
unsigned long res;
if (!_BitScanReverse(&res, arg))
{
return 32;
}
else
{
return res ^ 31;
}
#else
if (arg)
{
return __builtin_clzll((uint64_t)arg) - 32;
}
else
{
return 32;
}
#endif
}
inline uint64_t cntlz64(uint64_t arg)
{
#if defined(_MSC_VER)
unsigned long res;
if (!_BitScanReverse64(&res, arg))
{
return 64;
}
else
{
return res ^ 63;
}
#else
if (arg)
{
return __builtin_clzll(arg);
}
else
{
return 64;
}
#endif
}
// compare 16 packed unsigned bytes (greater than)
inline __m128i sse_cmpgt_epu8(__m128i A, __m128i B)
{
// (A xor 0x80) > (B xor 0x80)
const auto sign = _mm_set1_epi32(0x80808080);
return _mm_cmpgt_epi8(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign));
}
inline __m128i sse_cmpgt_epu16(__m128i A, __m128i B)
{
const auto sign = _mm_set1_epi32(0x80008000);
return _mm_cmpgt_epi16(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign));
}
inline __m128i sse_cmpgt_epu32(__m128i A, __m128i B)
{
const auto sign = _mm_set1_epi32(0x80000000);
return _mm_cmpgt_epi32(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign));
}
inline __m128 sse_exp2_ps(__m128 A)
{
const auto x0 = _mm_max_ps(_mm_min_ps(A, _mm_set1_ps(127.4999961f)), _mm_set1_ps(-127.4999961f));
const auto x1 = _mm_add_ps(x0, _mm_set1_ps(0.5f));
const auto x2 = _mm_sub_epi32(_mm_cvtps_epi32(x1), _mm_and_si128(_mm_castps_si128(_mm_cmpnlt_ps(_mm_setzero_ps(), x1)), _mm_set1_epi32(1)));
const auto x3 = _mm_sub_ps(x0, _mm_cvtepi32_ps(x2));
const auto x4 = _mm_mul_ps(x3, x3);
const auto x5 = _mm_mul_ps(x3, _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(x4, _mm_set1_ps(0.023093347705f)), _mm_set1_ps(20.20206567f)), x4), _mm_set1_ps(1513.906801f)));
const auto x6 = _mm_mul_ps(x5, _mm_rcp_ps(_mm_sub_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(233.1842117f), x4), _mm_set1_ps(4368.211667f)), x5)));
return _mm_mul_ps(_mm_add_ps(_mm_add_ps(x6, x6), _mm_set1_ps(1.0f)), _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(x2, _mm_set1_epi32(127)), 23)));
}
inline __m128 sse_log2_ps(__m128 A)
{
const auto _1 = _mm_set1_ps(1.0f);
const auto _c = _mm_set1_ps(1.442695040f);
const auto x0 = _mm_max_ps(A, _mm_castsi128_ps(_mm_set1_epi32(0x00800000)));
const auto x1 = _mm_or_ps(_mm_and_ps(x0, _mm_castsi128_ps(_mm_set1_epi32(0x807fffff))), _1);
const auto x2 = _mm_rcp_ps(_mm_add_ps(x1, _1));
const auto x3 = _mm_mul_ps(_mm_sub_ps(x1, _1), x2);
const auto x4 = _mm_add_ps(x3, x3);
const auto x5 = _mm_mul_ps(x4, x4);
const auto x6 = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(-0.7895802789f), x5), _mm_set1_ps(16.38666457f)), x5), _mm_set1_ps(-64.1409953f));
const auto x7 = _mm_rcp_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(-35.67227983f), x5), _mm_set1_ps(312.0937664f)), x5), _mm_set1_ps(-769.6919436f)));
const auto x8 = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_castps_si128(x0), 23), _mm_set1_epi32(127)));
return _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(x5, x6), x7), x4), _c), _mm_add_ps(_mm_mul_ps(x4, _c), x8));
}