cuElim/include/header.cuh
2024-09-05 23:46:07 +08:00

123 lines
3.6 KiB
Plaintext
Executable File

#ifndef HEADER_CUH
#define HEADER_CUH
#include <iostream>
#include <cassert>
// #include <fstream> // matrix
#include <set> // gf28
#include <random> // matrix
// #include <map>
// #include <vector>
// #include <algorithm>
// #include <numeric>
// #include <omp.h>
using namespace std;
// Storage word: one 64-bit word packs base_num byte-sized elements.
using base_t = uint64_t;
// A single 8-bit element (one byte lane of a base_t).
using gf28_t = uint8_t;
// Bits per element.
static const size_t base_deg = 8;
// Elements per base_t word.
static const size_t base_num = 8;
// Bits per base_t word.
static const size_t base_len = 64;
// Compile-time check that the three sizes above agree with each other and with sizeof(base_t).
static_assert(base_len == base_deg * base_num && base_len == sizeof(base_t) * 8);
// Word-level and element-level zero / one constants.
static const base_t base_zero = (base_t)0x00'00'00'00'00'00'00'00;
static const base_t base_one = (base_t)0x00'00'00'00'00'00'00'01;
static const gf28_t gf28_zero = (gf28_t)0x00;
static const gf28_t gf28_one = (gf28_t)0x01;
// All 64 bits set.
static const base_t base_fullmask = (base_t)0xFF'FF'FF'FF'FF'FF'FF'FF;
// base_deg_mask[i] selects byte lane i of a word (lane 0 is the least significant byte).
static const base_t base_deg_mask[8] = {
(base_t)0x00'00'00'00'00'00'00'FF,
(base_t)0x00'00'00'00'00'00'FF'00,
(base_t)0x00'00'00'00'00'FF'00'00,
(base_t)0x00'00'00'00'FF'00'00'00,
(base_t)0x00'00'00'FF'00'00'00'00,
(base_t)0x00'00'FF'00'00'00'00'00,
(base_t)0x00'FF'00'00'00'00'00'00,
(base_t)0xFF'00'00'00'00'00'00'00};
// Thread-block dimensions; presumably used as blockDim.x / blockDim.y at launch sites not shown here.
static const size_t THREAD_X = 32; // columns
static const size_t THREAD_Y = base_deg; // rows
// 256 x 256 byte lookup table in constant memory, indexed as d_mul_table[a][b].
// NOTE(review): presumably the GF(2^8) product table, uploaded from the host elsewhere
// (e.g. via cudaMemcpyToSymbol) -- the initialisation is not visible in this header.
__constant__ gf28_t d_mul_table[1 << base_deg][1 << base_deg];
// Returns the address of word `w` in row `r` of a pitched 2-D array of base_t.
//
// `pitch` is the row stride measured in base_t elements (it is applied through
// pointer arithmetic on a base_t *), not in bytes.
// NOTE(review): callers holding a byte pitch (e.g. from cudaMallocPitch) presumably
// divide by sizeof(base_t) before calling -- confirm at the call sites.
//
// Declared `inline` like the other helpers in this header so that including the
// header from more than one translation unit does not produce duplicate definitions.
__host__ __device__ inline base_t *at_pitch(base_t *base, size_t pitch, size_t r, size_t w)
{
    return base + r * pitch + w;
}
// Converts a byte-lane index into the bit position of that lane's lowest bit
// (lane 0 -> bit 0, lane 1 -> bit 8, ...).
__host__ __device__ inline size_t offset(size_t idx)
{
    const size_t bits_per_lane = 8;
    return idx * bits_per_lane;
}
// Extracts byte lane `idx` of `src`: the word is shifted right so the requested
// lane sits in the low 8 bits, and the narrowing conversion drops everything above.
__host__ __device__ inline gf28_t get8(base_t src, size_t idx)
{
    const base_t shifted = src >> offset(idx);
    return static_cast<gf28_t>(shifted);
}
// Writes `src` into byte lane `idx` of `dst` by OR-ing it in.
// Precondition: the target lane of `dst` must already be zero; existing bits in
// that lane are not cleared, so a non-zero lane would be merged with `src`.
__host__ __device__ inline void set8(base_t &dst, size_t idx, gf28_t src)
{
    const base_t lane_bits = static_cast<base_t>(src) << offset(idx);
    dst = dst | lane_bits;
}
// Clears byte lane `idx` of `dst`, leaving the other lanes untouched.
// Host-only: the lane mask is read from the host-side base_deg_mask table.
__host__ inline void del8(base_t &dst, size_t idx)
{
    const base_t keep_others = ~base_deg_mask[idx];
    dst = dst & keep_others;
}
// Multiplies byte lanes of `base` by `val` through d_mul_table lookups.
//
// Lanes `offset` .. base_num-1 of the result hold d_mul_table[val][lane]; lanes
// below `offset` stay zero. When `val` is zero the table is not consulted and
// an all-zero word is returned.
__device__ inline base_t mul_base(const gf28_t val, const base_t base, const size_t offset = 0)
{
    base_t product = base_zero;
    if (val != 0)
    {
        size_t lane = offset;
        while (lane < base_num)
        {
            const gf28_t scaled = d_mul_table[val][get8(base, lane)];
            // product starts as zero, so every lane satisfies set8's "lane is zero" precondition.
            set8(product, lane, scaled);
            ++lane;
        }
    }
    return product;
}
// Fills a lookup table of pre-multiplied rows.
//
// Thread mapping: the x dimension indexes the word (column) within a row, the
// y dimension indexes the output table row; threads outside width x nrows do nothing.
// For output row r, byte 0 of r is the multiplier and byte 1 of r selects the
// source row, so r_tb[r][w] = mul_base(byte0(r), src[byte1(r)][w]).
// Both pitches are passed to at_pitch, i.e. they are strides in base_t elements.
// NOTE(review): this kernel is defined (not just declared) in a header without
// `static`/`inline`; confirm the header is only included from one translation unit.
__global__ void gpu_mktb_kernel(base_t *r_tb, size_t tb_pitch, base_t *src, size_t s_pitch, size_t width, size_t nrows)
{
    const size_t col = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col < width && row < nrows)
    {
        const gf28_t factor = get8(row, 0);
        const size_t src_row = get8(row, 1);
        const base_t word = *at_pitch(src, s_pitch, src_row, col);
        *at_pitch(r_tb, tb_pitch, row, col) = mul_base(factor, word);
    }
}
// Reverses the byte order of a 64-bit word (byte lane 0 becomes lane 7, and so on).
// The bits inside each byte keep their order.
__host__ inline base_t rev8(base_t n)
{
    base_t reversed = 0;
    for (unsigned lane = 0; lane < sizeof(base_t); ++lane)
    {
        // Pop the lowest byte of n and push it onto the low end of the result,
        // so the first byte taken ends up in the most significant position.
        reversed = (reversed << 8) | (n & 0xFF);
        n >>= 8;
    }
    return reversed;
}
// Evaluates a CUDA runtime call exactly once; if it does not return cudaSuccess,
// prints the file, line and cudaGetErrorString() text to stderr and terminates
// the process with EXIT_FAILURE.
//
// The argument is parenthesised and the result is stored in a deliberately
// unusual local name, so that CUDA_CHECK(err) with a caller variable called
// `err` checks the caller's value instead of self-initialising a shadowing local.
// Kernel launches return nothing: follow them with CUDA_CHECK(cudaGetLastError()).
#define CUDA_CHECK(call)                                                      \
    do                                                                        \
    {                                                                         \
        const cudaError_t cuda_check_err_ = (call);                           \
        if (cuda_check_err_ != cudaSuccess)                                   \
        {                                                                     \
            fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n",      \
                    __FILE__, __LINE__, cudaGetErrorString(cuda_check_err_)); \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
#endif