From a1c14efec3a28aa7ed26f3c53a00c0c9ded3ce7b Mon Sep 17 00:00:00 2001
From: shijin <sj_shijin@outlook.com>
Date: Thu, 5 Sep 2024 23:46:07 +0800
Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E6=88=90=E4=B9=98=E6=B3=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                 |  3 ++-
 CMakeLists.txt             |  2 +-
 benchmark/CMakeLists.txt   | 13 ++++++++++
 benchmark/bench_mul8.cu    | 19 ++++++++++++++
 include/header.cuh         | 30 ++++++++++++++++++----
 include/matrix.cuh         | 23 ++++++++++++++---
 include/multiplication.cuh | 51 ++++++++++++++++++++------------------
 src/main.cu                | 11 +++++---
 8 files changed, 114 insertions(+), 38 deletions(-)
 create mode 100644 benchmark/CMakeLists.txt
 create mode 100644 benchmark/bench_mul8.cu
diff --git a/.gitignore b/.gitignore
index d163863..01f9cb9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-build/
\ No newline at end of file
+build/
+.vscode/
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6d3877f..bd6657c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,4 +24,4 @@ target_link_libraries(cuelim OpenMP::OpenMP_CXX)  # 链接 OpenMP 库
 enable_testing()  # 启动ctest测试
 add_subdirectory(test)  # 添加测试目录
 
-# add_subdirectory(benchmark)  # 添加性能测试目录
\ No newline at end of file
+add_subdirectory(benchmark)  # 添加性能测试目录
\ No newline at end of file
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
new file mode 100644
index 0000000..3e15593
--- /dev/null
+++ b/benchmark/CMakeLists.txt
@@ -0,0 +1,13 @@
+find_package(benchmark REQUIRED)
+
+include_directories(${PROJECT_SOURCE_DIR}/test)
+
+set(BENCH_SRC_FILES
+    "bench_mul8.cu"
+)
+
+foreach(SRC ${BENCH_SRC_FILES})
+    get_filename_component(SRC_NAME ${SRC} NAME_WE)
+    add_executable(${SRC_NAME} ${SRC})
+    target_link_libraries(${SRC_NAME} benchmark::benchmark benchmark::benchmark_main)
+endforeach()
\ No newline at end of file
diff --git a/benchmark/bench_mul8.cu b/benchmark/bench_mul8.cu
new file mode 100644
index 0000000..5e9e65c
--- /dev/null
+++ b/benchmark/bench_mul8.cu
@@ -0,0 +1,19 @@
+#include <benchmark/benchmark.h>
+#include "cuelim.cuh"
+
+template <GF28Matrix (*GpuFunc)(const GF28Matrix &, const GF28Matrix &, const GF28 &)>
+void bench_mul8(benchmark::State &state)
+{
+    GF28 ff(0b100011101);
+    uint_fast32_t seed = 41921095;
+    size_t x = state.range(0), y = state.range(1), z = state.range(2);
+    GF28Matrix A(x, y), B(y, z);
+    A.randomize(seed);
+    B.randomize(seed);
+    for (auto _ : state)
+    {
+        GF28Matrix C(GpuFunc(A, B, ff));
+    }
+}
+
+BENCHMARK_TEMPLATE(bench_mul8, gpu_mul)->Args({100000, 100000, 100000});
\ No newline at end of file
diff --git a/include/header.cuh b/include/header.cuh
index 68c02db..4863530 100755
--- a/include/header.cuh
+++ b/include/header.cuh
@@ -3,15 +3,15 @@
 
 #include <iostream>
 #include <cassert>
-#include <cuda_runtime.h>
+// #include <fstream> // matrix
 
-#include <set> // gf28
+#include <set>    // gf28
+#include <random> // matrix
 // #include <map>
 // #include <vector>
 
 // #include <algorithm>
 // #include <numeric>
-#include <random> // matrix
 // #include <omp.h>
 
 using namespace std;
@@ -45,7 +45,7 @@ static const size_t THREAD_Y = base_deg; // 行
 
 __constant__ gf28_t d_mul_table[1 << base_deg][1 << base_deg];
 
-__device__ base_t *at_pitch(base_t *base, size_t pitch, size_t r, size_t w)
+__host__ __device__ base_t *at_pitch(base_t *base, size_t pitch, size_t r, size_t w)
 {
     return base + r * pitch + w;
 }
@@ -71,8 +71,12 @@ __host__ inline void del8(base_t &dst, size_t idx)
     dst &= ~base_deg_mask[idx];
 }
 
-__device__ base_t mul_base(const gf28_t val, const base_t base, const size_t offset = 0)
+__device__ inline base_t mul_base(const gf28_t val, const base_t base, const size_t offset = 0)
 {
+    if (val == 0)
+    {
+        return base_zero;
+    }
     base_t temp = base_zero;
     for (size_t i = offset; i < base_num; i++)
     {
@@ -81,6 +85,22 @@ __device__ base_t mul_base(const gf28_t val, const base_t base, const size_t off
     return temp;
 }
 
+__global__ void gpu_mktb_kernel(base_t *r_tb, size_t tb_pitch, base_t *src, size_t s_pitch, size_t width, size_t nrows)
+{
+    size_t w = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t r = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (w >= width || r >= nrows)
+    {
+        return;
+    }
+
+    gf28_t val = get8(r, 0);
+    base_t s = *at_pitch(src, s_pitch, get8(r, 1), w);
+    base_t d = mul_base(val, s);
+    *at_pitch(r_tb, tb_pitch, r, w) = d;
+}
+
 __host__ inline base_t rev8(base_t n)
 {
     n = (n & 0xff00ff00ff00ff00ul) >> 8 | (n & 0x00ff00ff00ff00fful) << 8;
diff --git a/include/matrix.cuh b/include/matrix.cuh
index a464f35..21a5cef 100755
--- a/include/matrix.cuh
+++ b/include/matrix.cuh
@@ -16,14 +16,14 @@ public:
     GF28Matrix(size_t nrows, size_t ncols) : nrows(nrows), ncols(ncols), type(root)
     {
         width = (ncols - 1) / base_num + 1;
-        pitch = ((width - 1) / 4) * 4 + 1; // 以32字节（4*64bit）为单位对齐
+        pitch = ((width - 1) / 4 + 1) * 4; // 以32字节（4*64bit）为单位对齐
         CUDA_CHECK(cudaMallocManaged((void **)&data, nrows * pitch * sizeof(base_t)));
         CUDA_CHECK(cudaMemset(data, 0, nrows * pitch * sizeof(base_t)));
     }
     // 只能拷贝构造root矩阵
     GF28Matrix(const GF28Matrix &m) : GF28Matrix(m.nrows, m.ncols)
     {
-        cudaMemcpy2D(data, pitch * sizeof(base_t), m.data, m.pitch * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault);
+        CUDA_CHECK(cudaMemcpy2D(data, pitch * sizeof(base_t), m.data, m.pitch * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault));
     }
     GF28Matrix(GF28Matrix &&m) noexcept : nrows(m.nrows), ncols(m.ncols), width(m.width), pitch(m.pitch), type(m.type), data(m.data)
     {
@@ -41,7 +41,7 @@ public:
             return *this;
         }
         assert(nrows == m.nrows && ncols == m.ncols);
-        cudaMemcpy2D(data, pitch * sizeof(base_t), m.data, m.pitch * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault);
+        CUDA_CHECK(cudaMemcpy2D(data, pitch * sizeof(base_t), m.data, m.pitch * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault));
         return *this;
     }
     GF28Matrix &operator=(GF28Matrix &&m) noexcept
@@ -111,6 +111,22 @@ public:
         }
     }
 
+    // void write(string path) const
+    // {
+    //     assert(type == root);
+    //     ofstream out(path, ios::binary);
+    //     out.write((char *)data, nrows * pitch * sizeof(base_t));
+    //     out.close();
+    // }
+
+    // void read(string path)
+    // {
+    //     assert(type == root);
+    //     ifstream in(path, ios::binary);
+    //     in.read((char *)data, nrows * pitch * sizeof(base_t));
+    //     in.close();
+    // }
+
     bool operator==(const GF28Matrix &m) const
     {
         if (nrows != m.nrows || ncols != m.ncols)
@@ -164,6 +180,7 @@ public:
 
     friend ostream &operator<<(ostream &out, const GF28Matrix &m);
     void gpu_addmul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf);
+    friend GF28Matrix gpu_mul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf);
 
     // size_t nrows, ncols;
     // size_t width, pitch;
diff --git a/include/multiplication.cuh b/include/multiplication.cuh
index ee9d20b..211ca62 100644
--- a/include/multiplication.cuh
+++ b/include/multiplication.cuh
@@ -4,49 +4,52 @@
 #include "matrix.cuh"
 #include "gf28.cuh"
 
-// 处理32base列的所有行
-__global__ void gpu_addmul_kernel(base_t *a, size_t a_pitch, base_t *b, size_t b_pitch, base_t *c, size_t c_pitch, size_t nrows, size_t width)
+__global__ void gpu_addmul_kernel(base_t *a, size_t a_pitch, base_t *tb, size_t tb_pitch, base_t *c, size_t c_pitch, size_t tb_num, size_t width, size_t nrows)
 {
-    __shared__ __align__(8) base_t src[base_deg][THREAD_X];
-    size_t r = threadIdx.y;
     size_t w = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t r = blockIdx.y * blockDim.y + threadIdx.y;
 
-    if (w >= width)
+    if (w >= width || r >= nrows)
     {
         return;
     }
 
-    if (r < nrows && w < width)
-        src[threadIdx.y][threadIdx.x] = *at_pitch(b, b_pitch, r, w);
-    else
-        src[threadIdx.y][threadIdx.x] = base_zero;
-
-    __syncthreads();
-
-    for (; r < nrows; r += base_deg)
+    base_t val = *at_pitch(a, a_pitch, r, 0);
+    base_t temp = base_zero;
+    for (size_t i = 0; i < tb_num; i++)
     {
-        base_t val = *at_pitch(a, a_pitch, r, 0);
-        base_t temp = base_zero;
-        for (size_t i = 0; i < base_deg; i++)
-        {
-            temp ^= mul_base(get8(val, i), src[i][threadIdx.x]);
-        }
-        *at_pitch(c, c_pitch, r, w) ^= temp;
+        temp ^= *at_pitch(tb, tb_pitch, i * (1 << base_deg) + get8(val, i), w);
     }
+    *at_pitch(c, c_pitch, r, w) ^= temp;
 }
 
 __host__ void GF28Matrix::gpu_addmul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf)
 {
     assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols);
     cudaMemcpyToSymbol(d_mul_table, gf.mul_table, (1 << base_deg) * (1 << base_deg) * sizeof(gf28_t));
+    GF28Matrix tb(base_num * (1 << base_deg), b.ncols);
     for (size_t w = 0; w < a.width; w++)
     {
-        dim3 block(THREAD_X, THREAD_Y);
-        dim3 grid((b.width - 1) / block.x + 1);
+        size_t tb_num = min(base_num, a.ncols - w * base_num);
+
+        dim3 block_tb(THREAD_X, THREAD_Y);
+        dim3 grid_tb((b.width - 1) / block_tb.x + 1, (tb.nrows - 1) / block_tb.y + 1);
+        gpu_mktb_kernel<<<grid_tb, block_tb>>>(tb.data, tb.pitch, b.at_base(w * base_num, 0), b.pitch, tb.width, tb_num * (1 << base_deg));
+        cudaDeviceSynchronize();
+
+        dim3 block(THREAD_X, THREAD_Y);
+        dim3 grid((b.width - 1) / block.x + 1, (nrows - 1) / block.y + 1);
+        gpu_addmul_kernel<<<grid, block>>>(a.at_base(0, w), a.pitch, tb.data, tb.pitch, data, pitch, tb_num, width, nrows);
         cudaDeviceSynchronize();
-        gpu_addmul_kernel<<<grid, block>>>(a.at_base(0, w), a.pitch, b.at_base(w * base_num, 0), b.pitch, data, pitch, nrows, width);
     }
-    cudaDeviceSynchronize();
+}
+
+__host__ GF28Matrix gpu_mul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf)
+{
+    assert(a.ncols == b.nrows);
+    GF28Matrix c(a.nrows, b.ncols);
+    c.gpu_addmul(a, b, gf);
+    return c;
 }
 
 #endif
diff --git a/src/main.cu b/src/main.cu
index 69827c2..06d1dc5 100644
--- a/src/main.cu
+++ b/src/main.cu
@@ -2,10 +2,13 @@
 
 int main()
 {
-    GF28Matrix a(10000, 10000);
+    GF28Matrix a(10, 10);
     a.randomize(123);
-    GF28Matrix b(10000, 10000);
-    b.randomize(456);
-    GF28Matrix c(10000, 10000);
+    GF28Matrix b(10, 10);
+    b.randomize(123);
+    GF28Matrix c(10, 10);
     c.gpu_addmul(a, b, GF28(0b100011101));
+    cout << a << endl;
+    cout << b << endl;
+    cout << c << endl;
 }
\ No newline at end of file