使用命名空间进行划分

2024-09-14 16:15:13 +08:00 · 2024-09-14 16:15:13 +08:00 · e73b158a37
commit e73b158a37
parent 2375705792
15 changed files with 826 additions and 822 deletions
--- a/benchmark/bench_gf256_mul.cu
+++ b/benchmark/bench_gf256_mul.cu
@@ -1,6 +1,8 @@
 #include <benchmark/benchmark.h>
 #include "cuelim.cuh"

+using namespace gf256;
+
 template <MatGF256 (*GpuFunc)(const MatGF256 &, const MatGF256 &, const GF256 &)>
 void bench_gf256_mul(benchmark::State &state)
 {
--- a/benchmark/bench_gfp_mul.cu
+++ b/benchmark/bench_gfp_mul.cu
@@ -1,6 +1,8 @@
 #include <benchmark/benchmark.h>
 #include "test_header.cuh"

+using namespace gfp;
+
 static void bench_gfp(benchmark::State &state)
 {
    uint_fast32_t seed = 41921095;
--- a/include/gf256/gf256_elim.cuh
+++ b/include/gf256/gf256_elim.cuh
@@ -3,6 +3,8 @@

 #include "gf256_mat.cuh"

+namespace gf256
+{
    void MatGF256::cpu_swap_row(size_t r1, size_t r2)
    {
        if (r1 == r2)
@@ -19,7 +21,7 @@ void MatGF256::cpu_swap_row(size_t r1, size_t r2)
        }
    }

-size_t gf256_cpu_elim_base(base_t *base_col, base_t base_col_len, size_t st_r, size_t w, vector<size_t> &p_col, vector<size_t> &p_row, const GF256 &gf)
+    size_t cpu_elim_base(base_t *base_col, base_t base_col_len, size_t st_r, size_t w, vector<size_t> &p_col, vector<size_t> &p_row, const GF256 &gf)
    {
        size_t rank = 0;
        size_t pivot[gf256_num];
@@ -58,7 +60,7 @@ size_t gf256_cpu_elim_base(base_t *base_col, base_t base_col_len, size_t st_r, s
        return rank;
    }

-__global__ void gf256_gpu_mksrc_kernel(base_t *src, size_t s_rowstride, base_t *spL, size_t src_rank, size_t width)
+    __global__ void gpu_mksrc_kernel(base_t *src, size_t s_rowstride, base_t *spL, size_t src_rank, size_t width)
    {
        size_t w = blockIdx.x * blockDim.x + threadIdx.x;
        if (w >= width)
@@ -92,7 +94,7 @@ __global__ void gf256_gpu_mksrc_kernel(base_t *src, size_t s_rowstride, base_t *
        }
    }

-__global__ void gf256_gpu_elim_kernel(base_t *idx, base_t *tb, size_t tb_rowstride, base_t *data, size_t rowstride, size_t rank, base_t pivot_base, size_t st_skip, size_t width, size_t nrows)
+    __global__ void gpu_elim_kernel(base_t *idx, base_t *tb, size_t tb_rowstride, base_t *data, size_t rowstride, size_t rank, base_t pivot_base, size_t st_skip, size_t width, size_t nrows)
    {
        size_t w = blockIdx.x * blockDim.x + threadIdx.x;
        size_t r = blockIdx.y * blockDim.y + threadIdx.y;
@@ -131,7 +133,7 @@ __host__ ElimResult MatGF256::gpu_elim(const GF256 &gf)
        {
            CUDA_CHECK(cudaMemcpy2D(base_col + rank, sizeof(base_t), at_base(rank, w), rowstride * sizeof(base_t), sizeof(base_t), nrows - rank, cudaMemcpyDefault));

-        size_t src_rank = gf256_cpu_elim_base(base_col + rank, nrows - rank, rank, w, p_col, p_row, gf);
+            size_t src_rank = cpu_elim_base(base_col + rank, nrows - rank, rank, w, p_col, p_row, gf);

            if (src_rank == 0)
            {
@@ -162,7 +164,7 @@ __host__ ElimResult MatGF256::gpu_elim(const GF256 &gf)

            dim3 block_src(THREAD_X);
            dim3 grid_src((width - w - 1) / block_src.x + 1);
-        gf256_gpu_mksrc_kernel<<<grid_src, block_src>>>(at_base(rank, w), rowstride, spL, src_rank, width);
+            gpu_mksrc_kernel<<<grid_src, block_src>>>(at_base(rank, w), rowstride, spL, src_rank, width);
            cudaDeviceSynchronize();

            dim3 block_tb(THREAD_X, THREAD_Y);
@@ -174,7 +176,7 @@ __host__ ElimResult MatGF256::gpu_elim(const GF256 &gf)

            dim3 block(THREAD_X, THREAD_Y);
            dim3 grid((width - w - 1) / block.x + 1, (nrows - 1) / block.y + 1);
-        gf256_gpu_elim_kernel<<<grid, block>>>(idx, tb.data, tb.rowstride, at_base(0, w), rowstride, src_rank, pivot_base, rank, width - w, nrows);
+            gpu_elim_kernel<<<grid, block>>>(idx, tb.data, tb.rowstride, at_base(0, w), rowstride, src_rank, pivot_base, rank, width - w, nrows);
            cudaDeviceSynchronize();

            rank += src_rank;
@@ -188,5 +190,6 @@ __host__ ElimResult MatGF256::gpu_elim(const GF256 &gf)
        cudaFree(idx);
        return {rank, p_col, p_row};
    }
+}

 #endif
--- a/include/gf256/gf256_header.cuh
+++ b/include/gf256/gf256_header.cuh
@@ -4,6 +4,8 @@
 #include "../header.cuh"
 #include <set>

+namespace gf256
+{
    using gf256_t = uint8_t;

    static const size_t gf256_len = sizeof(gf256_t) * 8;
@@ -188,5 +190,5 @@ ostream &operator<<(ostream &out, const GF256 &gf)
        }
        return out;
    }
-
+}
 #endif
--- a/include/gf256/gf256_mat.cuh
+++ b/include/gf256/gf256_mat.cuh
@@ -7,6 +7,8 @@
 #include <vector>
 #include <algorithm>

+namespace gf256
+{
    struct ElimResult
    {
        size_t rank;
@@ -222,5 +224,6 @@ ostream &operator<<(ostream &out, const MatGF256 &m)
        }
        return out;
    }
+}

 #endif
--- a/include/gf256/gf256_mul.cuh
+++ b/include/gf256/gf256_mul.cuh
@@ -3,7 +3,9 @@

 #include "gf256_mat.cuh"

-__global__ void gf256_gpu_addmul_kernel(base_t *a, size_t a_rowstride, base_t *tb, size_t tb_rowstride, base_t *c, size_t c_rowstride, size_t tb_num, size_t width, size_t nrows)
+namespace gf256
+{
+    __global__ void gpu_addmul_kernel(base_t *a, size_t a_rowstride, base_t *tb, size_t tb_rowstride, base_t *c, size_t c_rowstride, size_t tb_num, size_t width, size_t nrows)
    {
        size_t w = blockIdx.x * blockDim.x + threadIdx.x;
        size_t r = blockIdx.y * blockDim.y + threadIdx.y;
@@ -40,7 +42,7 @@ __host__ void MatGF256::gpu_addmul(const MatGF256 &a, const MatGF256 &b, const G

            dim3 block(THREAD_X, THREAD_Y);
            dim3 grid((b.width - 1) / block.x + 1, (nrows - 1) / block.y + 1);
-        gf256_gpu_addmul_kernel<<<grid, block>>>(a.at_base(0, w), a.rowstride, tb.data, tb.rowstride, data, rowstride, tb_num, width, nrows);
+            gpu_addmul_kernel<<<grid, block>>>(a.at_base(0, w), a.rowstride, tb.data, tb.rowstride, data, rowstride, tb_num, width, nrows);
            cudaDeviceSynchronize();
        }
    }
@@ -52,5 +54,6 @@ __host__ MatGF256 gpu_mul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf)
        c.gpu_addmul(a, b, gf);
        return c;
    }
+}

 #endif
--- a/include/gfp/gfp_header.cuh
+++ b/include/gfp/gfp_header.cuh
@@ -3,25 +3,25 @@

 #include "../header.cuh"

+namespace gfp
+{
    using gfp_t = uint32_t;
-#define gfp_bits 32

-static_assert(sizeof(gfp_t) * 8 == gfp_bits);
-
-static const gfp_t gfp = 65521;
+    static const gfp_t gfprime = 65521;

    static const gfp_t gfp_zero = (gfp_t)0;
    static const gfp_t gfp_one = (gfp_t)1;
    static const gfp_t gfp_fullmask = (gfp_t)0xFF'FF;

-__managed__ gfp_t gfp_inv_table[gfp];
+    __managed__ gfp_t gfp_inv_table[gfprime];

    void init_inv_table()
    {
        gfp_inv_table[0] = 0;
        gfp_inv_table[1] = 1;
-    for (int i = 2; i < gfp; ++i)
-        gfp_inv_table[i] = (gfp - gfp / i) * gfp_inv_table[gfp % i] % gfp;
+        for (int i = 2; i < gfprime; ++i)
+            gfp_inv_table[i] = (gfprime - gfprime / i) * gfp_inv_table[gfprime % i] % gfprime;
+    }
 }

 #endif
--- a/include/gfp/gfp_mat.cuh
+++ b/include/gfp/gfp_mat.cuh
@@ -7,6 +7,8 @@
 #include <vector>
 #include <algorithm>

+namespace gfp
+{
    class MatGFP
    {
    public:
@@ -92,7 +94,7 @@ public:
            {
                for (size_t w = 0; w < width; w++)
                {
-                *at_base(r, w) = d(e) % gfp;
+                    *at_base(r, w) = d(e) % gfprime;
                }
            }
        }
@@ -171,7 +173,7 @@ public:
            {
                for (size_t w = 0; w < width; w++)
                {
-                *at_base(r, w) = (*at_base(r, w) + *m.at_base(r, w)) % gfp;
+                    *at_base(r, w) = (*at_base(r, w) + *m.at_base(r, w)) % gfprime;
                }
            }
        }
@@ -192,9 +194,9 @@ public:
                {
                    for (size_t i = 0; i < a.ncols; i++)
                    {
-                    *at_base(r, w) += (*a.at_base(r, i) * *b.at_base(i, w)) % gfp;
+                        *at_base(r, w) += (*a.at_base(r, i) * *b.at_base(i, w)) % gfprime;
                    }
-                *at_base(r, w) %= gfp;
+                    *at_base(r, w) %= gfprime;
                }
            }
        }
@@ -228,15 +230,12 @@ ostream &operator<<(ostream &out, const MatGFP &m)
        {
            for (size_t w = 0; w < m.width; w++)
            {
-#if gfp_bits == 64
-            printf("%05lu ", *m.at_base(r, w));
-#else
                printf("%05u ", *m.at_base(r, w));
-#endif
            }
            printf("\n");
        }
        return out;
    }
+}

 #endif
--- a/include/gfp/gfp_mul.cuh
+++ b/include/gfp/gfp_mul.cuh
@@ -3,12 +3,15 @@

 #include "gfp_mat.cuh"

+namespace gfp
+{
+
    static const int BlockRow = 128, BlockCol = 128; // 每个block处理c矩阵的一个子块
    static const int StepSize = 8;                   // block中一个循环处理的A矩阵的列数（B矩阵的行数）

    static_assert(BlockCol % THREAD_X == 0 && BlockRow % THREAD_Y == 0);

-__global__ void gfp_gpu_mul_kernel(gfp_t *__restrict__ a, const size_t a_rs, gfp_t *__restrict__ b, const size_t b_rs, gfp_t *__restrict__ c, const size_t c_rs, const size_t nrows, const size_t ncols, const size_t nsteps)
+    __global__ void gpu_mul_kernel(gfp_t *__restrict__ a, const size_t a_rs, gfp_t *__restrict__ b, const size_t b_rs, gfp_t *__restrict__ c, const size_t c_rs, const size_t nrows, const size_t ncols, const size_t nsteps)
    {

        const unsigned int bx = blockIdx.x;
@@ -17,13 +20,8 @@ __global__ void gfp_gpu_mul_kernel(gfp_t *__restrict__ a, const size_t a_rs, gfp
        const unsigned int ty = threadIdx.y;
        const unsigned int tid = ty * blockDim.x + tx;

-#if gfp_bits == 64
-    __shared__ alignas(8) gfp_t s_a[StepSize][BlockRow];
-    __shared__ alignas(8) gfp_t s_b[StepSize][BlockCol];
-#else
        __shared__ gfp_t s_a[StepSize][BlockRow];
        __shared__ gfp_t s_b[StepSize][BlockCol];
-#endif

        gfp_t tmp_c[BlockRow / THREAD_Y][BlockCol / THREAD_X] = {0};

@@ -45,27 +43,21 @@ __global__ void gfp_gpu_mul_kernel(gfp_t *__restrict__ a, const size_t a_rs, gfp
                {
                    for (int i = 0; i < BlockCol / THREAD_X; i++)
                    {
-#if gfp_bits == 64
-                    tmp_c[j][i] += (s_a[k][j * THREAD_Y + ty] * s_b[k][i * THREAD_X + tx]);
-#else
-                    tmp_c[j][i] += (s_a[k][j * THREAD_Y + ty] * s_b[k][i * THREAD_X + tx]) % gfp;
-#endif
+                        tmp_c[j][i] += (s_a[k][j * THREAD_Y + ty] * s_b[k][i * THREAD_X + tx]) % gfprime;
                    }
                }
            }
            __syncthreads();
-#if gfp_bits != 64
            if (s & gfp_fullmask == gfp_fullmask)
            {
                for (int j = 0; j < BlockRow / THREAD_Y; j++)
                {
                    for (int i = 0; i < BlockCol / THREAD_X; i++)
                    {
-                    tmp_c[j][i] %= gfp;
+                        tmp_c[j][i] %= gfprime;
                    }
                }
            }
-#endif
        }
        for (int j = 0; j < BlockRow / THREAD_Y; j++)
        {
@@ -73,7 +65,7 @@ __global__ void gfp_gpu_mul_kernel(gfp_t *__restrict__ a, const size_t a_rs, gfp
            {
                if (by * BlockRow + j * THREAD_Y + ty < nrows && bx * BlockCol + i * THREAD_X + tx < ncols)
                {
-                *at_base(c, c_rs, by * BlockRow + j * THREAD_Y + ty, bx * BlockCol + i * THREAD_X + tx) = tmp_c[j][i] % gfp;
+                    *at_base(c, c_rs, by * BlockRow + j * THREAD_Y + ty, bx * BlockCol + i * THREAD_X + tx) = tmp_c[j][i] % gfprime;
                }
            }
        }
@@ -85,8 +77,9 @@ __host__ void MatGFP::gpu_mul(const MatGFP &a, const MatGFP &b)

        dim3 block(THREAD_X, THREAD_Y);
        dim3 grid((width - 1) / block.x + 1, (nrows - 1) / block.y + 1);
-    gfp_gpu_mul_kernel<<<grid, block>>>(a.data, a.rowstride, b.data, b.rowstride, data, rowstride, nrows, width, a.width);
+        gpu_mul_kernel<<<grid, block>>>(a.data, a.rowstride, b.data, b.rowstride, data, rowstride, nrows, width, a.width);
        cudaDeviceSynchronize();
    }
+}

 #endif
--- a/include/header.cuh
+++ b/include/header.cuh
@@ -6,14 +6,6 @@

 #include <cpp_progress.hpp>

-// matrix
-// #include <map>
-// #include <vector>
-
-// #include <algorithm>
-// #include <numeric>
-// #include <omp.h>
-
 using namespace std;

 using base_t = uint64_t;
@@ -25,13 +17,8 @@ static const base_t base_one = (base_t)0x00'00'00'00'00'00'00'01;

 static const base_t base_fullmask = (base_t)0xFF'FF'FF'FF'FF'FF'FF'FF;

-static const size_t THREAD_X = 32; // 列
-static const size_t THREAD_Y = 8;  // 行
-
-// __host__ __device__ base_t *at_base(base_t *base, size_t rowstride, size_t r, size_t w)
-// {
-//     return base + r * rowstride + w;
-// }
+static const size_t THREAD_X = 16; // 列
+static const size_t THREAD_Y = 16; // 行

 template <typename T>
 __host__ __device__ T *at_base(T *base, size_t rowstride, size_t r, size_t w)
--- a/src/main.cu
+++ b/src/main.cu
@@ -4,6 +4,8 @@

 #undef SHOW_PROGRESS_BAR

+using namespace gfp;
+
 int main()
 {
    int m = 1000, k = 1000, n = 1000;
--- a/test/test_gf256_elim.cu
+++ b/test/test_gf256_elim.cu
@@ -1,6 +1,8 @@
 #include <gtest/gtest.h>
 #include "test_header.cuh"

+using namespace gf256;
+
 bool test_gf256_elim(size_t rank, size_t rank_col, size_t nrows, size_t ncols, const GF256 &gf256, uint_fast32_t seed)
 {
    assert(rank <= nrows && rank <= rank_col && rank_col <= ncols);
--- a/test/test_gf256_header.cu
+++ b/test/test_gf256_header.cu
@@ -1,6 +1,8 @@
 #include <gtest/gtest.h>
 #include "test_header.cuh"

+using namespace gf256;
+
 vector<gf256_t> expect_inv_table{
    0x00, 0x01, 0x8E, 0xF4, 0x47, 0xA7, 0x7A, 0xBA, 0xAD, 0x9D, 0xDD, 0x98, 0x3D, 0xAA, 0x5D, 0x96,
    0xD8, 0x72, 0xC0, 0x58, 0xE0, 0x3E, 0x4C, 0x66, 0x90, 0xDE, 0x55, 0x80, 0xA0, 0x83, 0x4B, 0x2A,
--- a/test/test_gf256_matrix.cu
+++ b/test/test_gf256_matrix.cu
@@ -1,6 +1,8 @@
 #include <gtest/gtest.h>
 #include "test_header.cuh"

+using namespace gf256;
+
 TEST(TestGF256Matrix, Equal)
 {
    MatGF256 a(50, 50);
--- a/test/test_gfp_mul.cu
+++ b/test/test_gfp_mul.cu
@@ -1,6 +1,8 @@
 #include <gtest/gtest.h>
 #include "test_header.cuh"

+using namespace gfp;
+
 bool test_gfp_mul(size_t m, size_t k, size_t n, uint_fast32_t seed)
 {
    MatGFP a(m, k);