// cuElim/include/header.cuh

#ifndef HEADER_CUH
#define HEADER_CUH

#include <iostream>
#include <cassert>
#include <cstdint> // uint64_t, uint8_t (base_t, gf28_t)
#include <cstdio>  // fprintf (CUDA_CHECK)
#include <cstdlib> // exit, EXIT_FAILURE (CUDA_CHECK)
// #include <fstream> // matrix

#include <set>    // gf28
#include <random> // matrix
// #include <map>
// #include <vector>

// #include <algorithm>
// #include <numeric>
// #include <omp.h>

// NOTE(review): a using-directive in a header leaks into every file that
// includes it. It is left in place because includers may rely on it.
using namespace std;

// base_t is the 64-bit storage word; gf28_t is one 8-bit element.
// The get8/set8 helpers in this header treat a base_t as base_num packed
// gf28_t lanes. (The name suggests GF(2^8) elements - TODO confirm.)
using base_t = uint64_t;
using gf28_t = uint8_t;

static const size_t base_deg = 8;  // bits per gf28_t lane
static const size_t base_num = 8;  // gf28_t lanes per base_t word
static const size_t base_len = 64; // total bits in a base_t word
// Compile-time check that the three constants describe base_t consistently.
static_assert(base_len == base_deg * base_num && base_len == sizeof(base_t) * 8);

// Zero and one in both the word type and the element type.
static const base_t base_zero = (base_t)0x00'00'00'00'00'00'00'00;
static const base_t base_one = (base_t)0x00'00'00'00'00'00'00'01;
static const gf28_t gf28_zero = (gf28_t)0x00;
static const gf28_t gf28_one = (gf28_t)0x01;

// All 64 bits set.
static const base_t base_fullmask = (base_t)0xFF'FF'FF'FF'FF'FF'FF'FF;
// base_deg_mask[i] has exactly the 8 bits of byte lane i set
// (lane 0 is the least significant byte). Used by del8 to clear one lane.
static const base_t base_deg_mask[8] = {
    (base_t)0x00'00'00'00'00'00'00'FF,
    (base_t)0x00'00'00'00'00'00'FF'00,
    (base_t)0x00'00'00'00'00'FF'00'00,
    (base_t)0x00'00'00'00'FF'00'00'00,
    (base_t)0x00'00'00'FF'00'00'00'00,
    (base_t)0x00'00'FF'00'00'00'00'00,
    (base_t)0x00'FF'00'00'00'00'00'00,
    (base_t)0xFF'00'00'00'00'00'00'00};

// Thread-count constants; presumably the x/y block dimensions used at kernel
// launch sites - NOTE(review): no launch is visible in this header, confirm.
static const size_t THREAD_X = 32;       // columns
static const size_t THREAD_Y = base_deg; // rows

// 256 x 256 lookup table in constant memory, read by mul_base as
// d_mul_table[val][byte]. At one byte per entry it occupies 65536 bytes.
// Nothing in this header fills it; it has to be populated from the host
// (e.g. with cudaMemcpyToSymbol) before any kernel that calls mul_base runs.
__constant__ gf28_t d_mul_table[1 << base_deg][1 << base_deg];

// Returns a pointer to word `w` of row `r` in a row-major 2-D array of base_t
// that starts at `base`.
//
// `pitch` is the row stride counted in base_t elements (not bytes), because it
// is applied through pointer arithmetic on a base_t*.
//
// Marked inline, like the other helpers in this header, so that including the
// header from more than one translation unit does not produce duplicate
// definitions of this function at link time.
__host__ __device__ inline base_t *at_pitch(base_t *base, size_t pitch, size_t r, size_t w)
{
    return base + r * pitch + w;
}

// Converts a byte-lane index into the bit position of that lane's lowest bit
// (lane 0 -> bit 0, lane 1 -> bit 8, ...).
__host__ __device__ inline size_t offset(size_t idx)
{
    return idx * 8;
}

// Extracts byte lane `idx` of `src`: the word is shifted right so the lane
// lands in the low 8 bits, and the narrowing cast drops everything above it.
__host__ __device__ inline gf28_t get8(base_t src, size_t idx)
{
    const base_t shifted = src >> offset(idx);
    return static_cast<gf28_t>(shifted);
}

// ORs `src` into byte lane `idx` of `dst`.
// Precondition: that lane of `dst` must already be zero, because the value is
// merged with OR and existing bits are never cleared (use del8 first if needed).
__host__ __device__ inline void set8(base_t &dst, size_t idx, gf28_t src)
{
    const base_t lane_bits = static_cast<base_t>(src) << offset(idx);
    dst = dst | lane_bits;
}

// Clears byte lane `idx` of `dst` and leaves the other lanes untouched.
// Host-only: it reads the host array base_deg_mask. `idx` is used directly as
// an index into that 8-entry array and is not range-checked here.
__host__ inline void del8(base_t &dst, size_t idx)
{
    const base_t keep = ~base_deg_mask[idx];
    dst = dst & keep;
}

// Multiplies byte lanes of `base` by `val` through the d_mul_table lookup and
// returns the packed result.
//
//   val    - multiplier; when it is 0 the result is base_zero and the table
//            is not consulted.
//   base   - packed word whose lanes are multiplied.
//   offset - first lane to process; lanes below it are left as zero in the
//            result. (Inside this body the parameter name hides the free
//            function offset().)
__device__ inline base_t mul_base(const gf28_t val, const base_t base, const size_t offset = 0)
{
    base_t product = base_zero;
    if (val != 0)
    {
        for (size_t lane = offset; lane < base_num; ++lane)
        {
            const gf28_t scaled = d_mul_table[val][get8(base, lane)];
            // product starts out all-zero, so set8's "lane must be empty" rule holds.
            set8(product, lane, scaled);
        }
    }
    return product;
}

// Fills the table `r_tb`: for every (row, column) inside nrows x width,
//   r_tb[row][col] = mul_base(low byte of row, src[second byte of row][col]).
//
// Launch layout: 2-D grid; x covers columns (base_t words, bounded by `width`),
// y covers rows (bounded by `nrows`). Threads outside that rectangle do nothing.
//
//   r_tb, tb_pitch - destination array and its row stride in base_t elements.
//   src,  s_pitch  - source array and its row stride in base_t elements.
//   width          - number of base_t words per row to process.
//   nrows          - number of destination rows to produce.
//
// The source row is taken from bits 8..15 of the row index and is not checked
// against the size of `src`; the caller must supply enough source rows.
__global__ void gpu_mktb_kernel(base_t *r_tb, size_t tb_pitch, base_t *src, size_t s_pitch, size_t width, size_t nrows)
{
    // Widen before multiplying: blockIdx/blockDim are 32-bit, so the product
    // could otherwise wrap around before being stored in a size_t.
    const size_t w = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t r = static_cast<size_t>(blockIdx.y) * blockDim.y + threadIdx.y;

    if (w >= width || r >= nrows)
    {
        return;
    }

    const gf28_t val = get8(r, 0);     // multiplier: low byte of the row index
    const size_t src_row = get8(r, 1); // source row: second byte of the row index
    const base_t s = *at_pitch(src, s_pitch, src_row, w);
    *at_pitch(r_tb, tb_pitch, r, w) = mul_base(val, s);
}

// Reverses the order of the 8 bytes in `n` (byte 0 <-> byte 7, 1 <-> 6, ...).
// Done in three steps: swap adjacent bytes, then adjacent 16-bit pairs, then
// the two 32-bit halves.
__host__ inline base_t rev8(base_t n)
{
    const base_t bytes_swapped =
        ((n >> 8) & 0x00ff00ff00ff00ffull) | ((n << 8) & 0xff00ff00ff00ff00ull);
    const base_t pairs_swapped =
        ((bytes_swapped >> 16) & 0x0000ffff0000ffffull) | ((bytes_swapped << 16) & 0xffff0000ffff0000ull);
    return (pairs_swapped >> 32) | (pairs_swapped << 32);
}

// Evaluates the CUDA runtime expression `call` exactly once. If the result is
// not cudaSuccess, prints the file, line and cudaGetErrorString() text to
// stderr and terminates the process with EXIT_FAILURE.
//
// The argument is parenthesised so any expression can be passed safely, and
// the local uses a macro-specific name so it cannot capture a caller variable
// named `err` that appears inside `call`.
#define CUDA_CHECK(call)                                                      \
    do                                                                        \
    {                                                                         \
        const cudaError_t cuda_check_err_ = (call);                           \
        if (cuda_check_err_ != cudaSuccess)                                   \
        {                                                                     \
            std::fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n", \
                         __FILE__, __LINE__,                                  \
                         cudaGetErrorString(cuda_check_err_));                \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

#endif