初步完成，但速度很慢

2024-09-05 16:56:58 +08:00 · 2024-09-05 16:56:58 +08:00 · 06942fbc8e
commit 06942fbc8e
13 changed files with 587 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
+build/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -0,0 +1,27 @@
+cmake_minimum_required(VERSION 3.24)  # 设置cmake最低版本
+
+# 设置 C++ 标准
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+# 设置 CUDA C++ 标准
+set(CMAKE_CUDA_STANDARD 20)
+set(CMAKE_CUDA_STANDARD_REQUIRED True)
+set(CMAKE_CUDA_ARCHITECTURES native)  # 设置CUDA架构
+
+# 设置编译选项
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3")
+
+project(cuElim_GF28 LANGUAGES CXX CUDA)  # 设置项目名和语言
+
+include_directories(${PROJECT_SOURCE_DIR}/include)  # 添加头文件目录
+
+find_package(OpenMP REQUIRED) # 查找 OpenMP 包
+
+add_executable(cuelim ./src/main.cu)  # 添加可执行文件
+target_link_libraries(cuelim OpenMP::OpenMP_CXX)  # 链接 OpenMP 库
+
+enable_testing()  # 启动ctest测试
+add_subdirectory(test)  # 添加测试目录
+
+# add_subdirectory(benchmark)  # 添加性能测试目录
--- a/include/cuelim.cuh
+++ b/include/cuelim.cuh
@ -0,0 +1,6 @@
+#ifndef CUELIM_CUH
+#define CUELIM_CUH
+
+#include "multiplication.cuh"
+
+#endif
--- a/include/elimination.cuh
+++ b/include/elimination.cuh
--- a/include/gf28.cuh
+++ b/include/gf28.cuh
@ -0,0 +1,95 @@
+#ifndef GF28_CUH
+#define GF28_CUH
+
+#include "header.cuh"
+
+static const set<base_t> irreducible_polynomials_degree_08{0x11b, 0x11d, 0x12b, 0x12d, 0x139, 0x13f, 0x14d, 0x15f, 0x163, 0x165, 0x169, 0x171, 0x177, 0x17b, 0x187, 0x18b, 0x18d, 0x19f, 0x1a3, 0x1a9, 0x1b1, 0x1bd, 0x1c3, 0x1cf, 0x1d7, 0x1dd, 0x1e7, 0x1f3, 0x1f5, 0x1f9};
+
+class GF28
+{
+public:
+    GF28(base_t poly)
+    {
+        assert(irreducible_polynomials_degree_08.count(poly) == 1);
+        this->poly = poly;
+        for (size_t x = 0; x < (1 << base_deg); x++)
+        {
+            mul_table[x][gf28_zero] = gf28_zero;
+            for (size_t d = 0; d < base_deg; d++)
+            {
+                gf28_t val = shift_left(x, d);
+                for (size_t y = (1 << d); y < (1 << (d + 1)); y++)
+                {
+                    mul_table[x][y] = val ^ mul_table[x][y ^ (1 << d)];
+                    if (mul_table[x][y] == gf28_one)
+                    {
+                        inv_table[x] = y;
+                    }
+                }
+            }
+        }
+        inv_table[gf28_zero] = gf28_zero;
+    }
+
+    gf28_t mul(const gf28_t x, const gf28_t y) const
+    {
+        return mul_table[x][y];
+    }
+
+    base_t mul_base(const gf28_t val, const base_t base, const size_t offset = 0) const
+    {
+        base_t temp = base_zero;
+        for (size_t i = offset; i < base_num; i++)
+        {
+            set8(temp, mul(val, get8(base, i)), i);
+        }
+        return temp;
+    }
+
+    gf28_t inv(gf28_t x)
+    {
+        return inv_table[x];
+    }
+
+    friend ostream &operator<<(ostream &out, const GF28 &gf);
+
+    GF28() = delete;
+    GF28(const GF28 &) = delete;
+    GF28(GF28 &&) = delete;
+    GF28 &operator=(const GF28 &) = delete;
+    GF28 &operator=(GF28 &&) = delete;
+
+    gf28_t mul_table[1 << base_deg][1 << base_deg];
+
+private:
+    gf28_t shift_left(gf28_t x, size_t d)
+    {
+        base_t temp = (base_t)x << d;
+        for (size_t i = base_deg - 1 + d; i > base_deg - 1; i--)
+        {
+            if (temp & (1 << i))
+            {
+                temp ^= poly << (i - base_deg);
+            }
+        }
+        return temp;
+    }
+
+    base_t poly;
+    gf28_t inv_table[1 << base_deg];
+};
+
+ostream &operator<<(ostream &out, const GF28 &gf)
+{
+    for (size_t x = 0; x < 1 << base_deg; x++)
+    {
+        for (size_t y = 0; y < 1 << base_deg; y++)
+        {
+            printf("%02X ", gf.mul_table[x][y]);
+        }
+        printf("\n");
+    }
+    return out;
+}
+
+#endif
--- a/include/header.cuh
+++ b/include/header.cuh
@ -0,0 +1,103 @@
+#ifndef HEADER_CUH
+#define HEADER_CUH
+
+#include <iostream>
+#include <cassert>
+#include <cuda_runtime.h>
+
+#include <set> // gf28
+// #include <map>
+// #include <vector>
+
+// #include <algorithm>
+// #include <numeric>
+#include <random> // matrix
+// #include <omp.h>
+
+using namespace std;
+
+using base_t = uint64_t;
+using gf28_t = uint8_t;
+
+static const size_t base_deg = 8;
+static const size_t base_num = 8;
+static const size_t base_len = 64;
+static_assert(base_len == base_deg * base_num && base_len == sizeof(base_t) * 8);
+
+static const base_t base_zero = (base_t)0x00'00'00'00'00'00'00'00;
+static const base_t base_one = (base_t)0x00'00'00'00'00'00'00'01;
+static const gf28_t gf28_zero = (gf28_t)0x00;
+static const gf28_t gf28_one = (gf28_t)0x01;
+
+static const base_t base_fullmask = (base_t)0xFF'FF'FF'FF'FF'FF'FF'FF;
+static const base_t base_deg_mask[8] = {
+    (base_t)0x00'00'00'00'00'00'00'FF,
+    (base_t)0x00'00'00'00'00'00'FF'00,
+    (base_t)0x00'00'00'00'00'FF'00'00,
+    (base_t)0x00'00'00'00'FF'00'00'00,
+    (base_t)0x00'00'00'FF'00'00'00'00,
+    (base_t)0x00'00'FF'00'00'00'00'00,
+    (base_t)0x00'FF'00'00'00'00'00'00,
+    (base_t)0xFF'00'00'00'00'00'00'00};
+
+static const size_t THREAD_X = 32;       // 列
+static const size_t THREAD_Y = base_deg; // 行
+
+__constant__ gf28_t d_mul_table[1 << base_deg][1 << base_deg];
+
+__device__ base_t *at_pitch(base_t *base, size_t pitch, size_t r, size_t w)
+{
+    return base + r * pitch + w;
+}
+
+__host__ __device__ inline size_t offset(size_t idx)
+{
+    return idx << 3;
+}
+
+__host__ __device__ inline gf28_t get8(base_t src, size_t idx)
+{
+    return (gf28_t)(src >> offset(idx));
+}
+
+// 确保set8对应位置的值为0
+__host__ __device__ inline void set8(base_t &dst, size_t idx, gf28_t src)
+{
+    dst |= (base_t)src << offset(idx);
+}
+
+__host__ inline void del8(base_t &dst, size_t idx)
+{
+    dst &= ~base_deg_mask[idx];
+}
+
+__device__ base_t mul_base(const gf28_t val, const base_t base, const size_t offset = 0)
+{
+    base_t temp = base_zero;
+    for (size_t i = offset; i < base_num; i++)
+    {
+        set8(temp, i, d_mul_table[val][get8(base, i)]);
+    }
+    return temp;
+}
+
+__host__ inline base_t rev8(base_t n)
+{
+    n = (n & 0xff00ff00ff00ff00ul) >> 8 | (n & 0x00ff00ff00ff00fful) << 8;
+    n = (n & 0xffff0000ffff0000ul) >> 16 | (n & 0x0000ffff0000fffful) << 16;
+    return n >> 32 | n << 32;
+}
+
+#define CUDA_CHECK(call)                                                 \
+    do                                                                   \
+    {                                                                    \
+        cudaError_t err = call;                                          \
+        if (err != cudaSuccess)                                          \
+        {                                                                \
+            fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n", \
+                    __FILE__, __LINE__, cudaGetErrorString(err));        \
+            exit(EXIT_FAILURE);                                          \
+        }                                                                \
+    } while (0)
+
+#endif
--- a/include/matrix.cuh
+++ b/include/matrix.cuh
@ -0,0 +1,192 @@
+#ifndef MATRIX_CUH
+#define MATRIX_CUH
+
+#include "header.cuh"
+#include "gf28.cuh"
+
+class GF28Matrix
+{
+public:
+    enum MatType
+    {
+        root,
+        view
+    };
+    // 只能构造root矩阵
+    GF28Matrix(size_t nrows, size_t ncols) : nrows(nrows), ncols(ncols), type(root)
+    {
+        width = (ncols - 1) / base_num + 1;
+        pitch = ((width - 1) / 4) * 4 + 1; // 以32字节（4*64bit）为单位对齐
+        CUDA_CHECK(cudaMallocManaged((void **)&data, nrows * pitch * sizeof(base_t)));
+        CUDA_CHECK(cudaMemset(data, 0, nrows * pitch * sizeof(base_t)));
+    }
+    // 只能拷贝构造root矩阵
+    GF28Matrix(const GF28Matrix &m) : GF28Matrix(m.nrows, m.ncols)
+    {
+        cudaMemcpy2D(data, pitch * sizeof(base_t), m.data, m.pitch * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault);
+    }
+    GF28Matrix(GF28Matrix &&m) noexcept : nrows(m.nrows), ncols(m.ncols), width(m.width), pitch(m.pitch), type(m.type), data(m.data)
+    {
+        m.nrows = 0;
+        m.ncols = 0;
+        m.width = 0;
+        m.pitch = 0;
+        m.type = view;
+        m.data = nullptr;
+    }
+    GF28Matrix &operator=(const GF28Matrix &m)
+    {
+        if (this == &m)
+        {
+            return *this;
+        }
+        assert(nrows == m.nrows && ncols == m.ncols);
+        cudaMemcpy2D(data, pitch * sizeof(base_t), m.data, m.pitch * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault);
+        return *this;
+    }
+    GF28Matrix &operator=(GF28Matrix &&m) noexcept
+    {
+        if (this == &m)
+        {
+            return *this;
+        }
+        if (type == root)
+        {
+            CUDA_CHECK(cudaFree(data));
+        }
+        nrows = m.nrows;
+        ncols = m.ncols;
+        width = m.width;
+        pitch = m.pitch;
+        type = m.type;
+        data = m.data;
+        m.nrows = 0;
+        m.ncols = 0;
+        m.width = 0;
+        m.pitch = 0;
+        m.type = view;
+        m.data = nullptr;
+        return *this;
+    }
+
+    ~GF28Matrix()
+    {
+        if (type == root)
+        {
+            CUDA_CHECK(cudaFree(data));
+        }
+    }
+
+    inline base_t *at_base(size_t r, size_t w) const
+    {
+        return data + r * pitch + w;
+    }
+
+    // 只能以base_t为单位进行操作
+    GF28Matrix createView(size_t begin_ri, size_t begin_wi, size_t end_rj, size_t end_wj) const
+    {
+        assert(begin_ri < end_rj && end_rj <= nrows && begin_wi < end_wj && end_wj <= width);
+        GF28Matrix view;
+        view.nrows = end_rj - begin_ri;
+        view.ncols = (end_wj == width ? ncols : end_wj * base_num) - begin_wi * base_num;
+        view.width = end_wj - begin_wi;
+        view.pitch = pitch;
+        view.data = at_base(begin_ri, begin_wi);
+        return view;
+    }
+
+    void randomize(base_t seed)
+    {
+        assert(type == root);
+        static default_random_engine e(seed);
+        static uniform_int_distribution<base_t> d;
+        base_t lastmask = base_fullmask >> (width * base_len - ncols * base_deg);
+        for (size_t r = 0; r < nrows; r++)
+        {
+            for (size_t w = 0; w < width; w++)
+            {
+                *at_base(r, w) = d(e);
+            }
+            *at_base(r, width - 1) &= lastmask;
+        }
+    }
+
+    bool operator==(const GF28Matrix &m) const
+    {
+        if (nrows != m.nrows || ncols != m.ncols)
+        {
+            return false;
+        }
+        for (size_t r = 0; r < nrows; r++)
+        {
+            for (size_t w = 0; w < width; w++)
+            {
+                if (*at_base(r, w) != *m.at_base(r, w))
+                {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    bool operator==(const base_t base) const
+    {
+        for (size_t r = 0; r < nrows; r++)
+        {
+            for (size_t w = 0; w < width; w++)
+            {
+                if (*at_base(r, w) != base)
+                {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+    void operator^=(const GF28Matrix &m)
+    {
+        assert(nrows == m.nrows && ncols == m.ncols);
+        for (size_t r = 0; r < nrows; r++)
+        {
+            for (size_t w = 0; w < width; w++)
+            {
+                *at_base(r, w) ^= *m.at_base(r, w);
+            }
+        }
+    }
+    GF28Matrix operator^(const GF28Matrix &m) const
+    {
+        GF28Matrix temp(*this);
+        temp ^= m;
+        return temp;
+    }
+
+    friend ostream &operator<<(ostream &out, const GF28Matrix &m);
+    void gpu_addmul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf);
+
+    // size_t nrows, ncols;
+    // size_t width, pitch;
+
+private:
+    GF28Matrix() : nrows(0), ncols(0), width(0), pitch(0), type(view), data(nullptr) {}
+    size_t nrows, ncols;
+    size_t width, pitch;
+    MatType type;
+    base_t *data;
+};
+
+ostream &operator<<(ostream &out, const GF28Matrix &m)
+{
+    for (size_t r = 0; r < m.nrows; r++)
+    {
+        for (size_t w = 0; w < m.width; w++)
+        {
+            printf("%016lX ", rev8(*m.at_base(r, w)));
+        }
+        printf("\n");
+    }
+    return out;
+}
+
+#endif
--- a/include/multiplication.cuh
+++ b/include/multiplication.cuh
@ -0,0 +1,52 @@
+#ifndef MULTIPLICATION_CUH
+#define MULTIPLICATION_CUH
+
+#include "matrix.cuh"
+#include "gf28.cuh"
+
+// 处理32base列的所有行
+__global__ void gpu_addmul_kernel(base_t *a, size_t a_pitch, base_t *b, size_t b_pitch, base_t *c, size_t c_pitch, size_t nrows, size_t width)
+{
+    __shared__ __align__(8) base_t src[base_deg][THREAD_X];
+    size_t r = threadIdx.y;
+    size_t w = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (w >= width)
+    {
+        return;
+    }
+
+    if (r < nrows && w < width)
+        src[threadIdx.y][threadIdx.x] = *at_pitch(b, b_pitch, r, w);
+    else
+        src[threadIdx.y][threadIdx.x] = base_zero;
+
+    __syncthreads();
+
+    for (; r < nrows; r += base_deg)
+    {
+        base_t val = *at_pitch(a, a_pitch, r, 0);
+        base_t temp = base_zero;
+        for (size_t i = 0; i < base_deg; i++)
+        {
+            temp ^= mul_base(get8(val, i), src[i][threadIdx.x]);
+        }
+        *at_pitch(c, c_pitch, r, w) ^= temp;
+    }
+}
+
+__host__ void GF28Matrix::gpu_addmul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf)
+{
+    assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols);
+    cudaMemcpyToSymbol(d_mul_table, gf.mul_table, (1 << base_deg) * (1 << base_deg) * sizeof(gf28_t));
+    for (size_t w = 0; w < a.width; w++)
+    {
+        dim3 block(THREAD_X, THREAD_Y);
+        dim3 grid((b.width - 1) / block.x + 1);
+        cudaDeviceSynchronize();
+        gpu_addmul_kernel<<<grid, block>>>(a.at_base(0, w), a.pitch, b.at_base(w * base_num, 0), b.pitch, data, pitch, nrows, width);
+    }
+    cudaDeviceSynchronize();
+}
+
+#endif
--- a/src/main.cu
+++ b/src/main.cu
@ -0,0 +1,11 @@
+#include "cuelim.cuh"
+
+int main()
+{
+    GF28Matrix a(10000, 10000);
+    a.randomize(123);
+    GF28Matrix b(10000, 10000);
+    b.randomize(456);
+    GF28Matrix c(10000, 10000);
+    c.gpu_addmul(a, b, GF28(0b100011101));
+}
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@ -0,0 +1,26 @@
+find_package(GTest REQUIRED)  # 查找GTest库
+
+include_directories(${PROJECT_SOURCE_DIR}/test)  # 添加测试头文件目录
+
+set(TEST_SRC_FILES
+    "test_gf28.cu"
+    "test_matrix.cu"
+)
+
+foreach(SRC ${TEST_SRC_FILES})
+    get_filename_component(SRC_NAME ${SRC} NAME_WE)
+    add_executable(${SRC_NAME} ${SRC})
+    target_link_libraries(${SRC_NAME} GTest::GTest GTest::Main)
+    gtest_discover_tests(${SRC_NAME})
+endforeach()
+
+# set(TEST_M4RIE_SRC_FILES
+#     "test_m4rie_interface.cu"
+# )
+
+# foreach(SRC ${TEST_M4RIE_SRC_FILES})
+#     get_filename_component(SRC_NAME ${SRC} NAME_WE)
+#     add_executable(${SRC_NAME} ${SRC})
+#     target_link_libraries(${SRC_NAME} GTest::GTest GTest::Main m4ri m4rie)
+#     gtest_discover_tests(${SRC_NAME})
+# endforeach()
--- a/test/test_gf28.cu
+++ b/test/test_gf28.cu
@ -0,0 +1,29 @@
+#include <gtest/gtest.h>
+#include "test_header.cuh"
+
+vector<gf28_t> expect_inv_table{
+    0x00, 0x01, 0x8E, 0xF4, 0x47, 0xA7, 0x7A, 0xBA, 0xAD, 0x9D, 0xDD, 0x98, 0x3D, 0xAA, 0x5D, 0x96,
+    0xD8, 0x72, 0xC0, 0x58, 0xE0, 0x3E, 0x4C, 0x66, 0x90, 0xDE, 0x55, 0x80, 0xA0, 0x83, 0x4B, 0x2A,
+    0x6C, 0xED, 0x39, 0x51, 0x60, 0x56, 0x2C, 0x8A, 0x70, 0xD0, 0x1F, 0x4A, 0x26, 0x8B, 0x33, 0x6E,
+    0x48, 0x89, 0x6F, 0x2E, 0xA4, 0xC3, 0x40, 0x5E, 0x50, 0x22, 0xCF, 0xA9, 0xAB, 0x0C, 0x15, 0xE1,
+    0x36, 0x5F, 0xF8, 0xD5, 0x92, 0x4E, 0xA6, 0x04, 0x30, 0x88, 0x2B, 0x1E, 0x16, 0x67, 0x45, 0x93,
+    0x38, 0x23, 0x68, 0x8C, 0x81, 0x1A, 0x25, 0x61, 0x13, 0xC1, 0xCB, 0x63, 0x97, 0x0E, 0x37, 0x41,
+    0x24, 0x57, 0xCA, 0x5B, 0xB9, 0xC4, 0x17, 0x4D, 0x52, 0x8D, 0xEF, 0xB3, 0x20, 0xEC, 0x2F, 0x32,
+    0x28, 0xD1, 0x11, 0xD9, 0xE9, 0xFB, 0xDA, 0x79, 0xDB, 0x77, 0x06, 0xBB, 0x84, 0xCD, 0xFE, 0xFC,
+    0x1B, 0x54, 0xA1, 0x1D, 0x7C, 0xCC, 0xE4, 0xB0, 0x49, 0x31, 0x27, 0x2D, 0x53, 0x69, 0x02, 0xF5,
+    0x18, 0xDF, 0x44, 0x4F, 0x9B, 0xBC, 0x0F, 0x5C, 0x0B, 0xDC, 0xBD, 0x94, 0xAC, 0x09, 0xC7, 0xA2,
+    0x1C, 0x82, 0x9F, 0xC6, 0x34, 0xC2, 0x46, 0x05, 0xCE, 0x3B, 0x0D, 0x3C, 0x9C, 0x08, 0xBE, 0xB7,
+    0x87, 0xE5, 0xEE, 0x6B, 0xEB, 0xF2, 0xBF, 0xAF, 0xC5, 0x64, 0x07, 0x7B, 0x95, 0x9A, 0xAE, 0xB6,
+    0x12, 0x59, 0xA5, 0x35, 0x65, 0xB8, 0xA3, 0x9E, 0xD2, 0xF7, 0x62, 0x5A, 0x85, 0x7D, 0xA8, 0x3A,
+    0x29, 0x71, 0xC8, 0xF6, 0xF9, 0x43, 0xD7, 0xD6, 0x10, 0x73, 0x76, 0x78, 0x99, 0x0A, 0x19, 0x91,
+    0x14, 0x3F, 0xE6, 0xF0, 0x86, 0xB1, 0xE2, 0xF1, 0xFA, 0x74, 0xF3, 0xB4, 0x6D, 0x21, 0xB2, 0x6A,
+    0xE3, 0xE7, 0xB5, 0xEA, 0x03, 0x8F, 0xD3, 0xC9, 0x42, 0xD4, 0xE8, 0x75, 0x7F, 0xFF, 0x7E, 0xFD};
+
+TEST(TestGF28, Inv)
+{
+    GF28 gf28(0b100011101);
+    for (size_t x = 0; x < 1 << base_deg; x++)
+    {
+        EXPECT_EQ(gf28.inv(x), expect_inv_table[x]);
+    }
+}
--- a/test/test_header.cuh
+++ b/test/test_header.cuh
@ -0,0 +1,6 @@
+#ifndef TEST_HEADER_CUH
+#define TEST_HEADER_CUH
+
+#include "cuelim.cuh"
+
+#endif
--- a/test/test_matrix.cu
+++ b/test/test_matrix.cu
@ -0,0 +1,39 @@
+#include <gtest/gtest.h>
+#include "test_header.cuh"
+
+TEST(TestMatrix, Equal)
+{
+    GF28Matrix a(50, 50);
+    EXPECT_TRUE(a == base_zero);
+    GF28Matrix v = a.createView(0, 0, 30, 3);
+    EXPECT_TRUE(v == base_zero);
+    a.randomize(1243);
+    EXPECT_TRUE(a == a);
+    EXPECT_TRUE(v == v);
+    GF28Matrix b(50, 50);
+    b.randomize(1243);
+    EXPECT_FALSE(a == b);
+}
+
+TEST(TestMatrix, Xor)
+{
+    GF28Matrix a(50, 50), b(50, 50);
+    a.randomize(1243);
+    b.randomize(1243);
+    GF28Matrix c = a ^ b;
+    a ^= c;
+    EXPECT_TRUE(a == b);
+    GF28Matrix va = a.createView(20, 1, 30, 3);
+    GF28Matrix vb = b.createView(10, 2, 20, 4);
+    GF28Matrix vc = va ^ vb;
+    va ^= vc;
+    EXPECT_TRUE(va == vb);
+}
+
+// TEST(TestMatrix, Basic)
+// {
+//     GF28Matrix a(50, 50);
+//     GF28Matrix v = a.createView(0, 0, 30, 3);
+
+//     EXPECT_EQ(v.type, GF28Matrix::view);
+// }