From a1c14efec3a28aa7ed26f3c53a00c0c9ded3ce7b Mon Sep 17 00:00:00 2001 From: shijin Date: Thu, 5 Sep 2024 23:46:07 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E6=88=90=E4=B9=98=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 ++- CMakeLists.txt | 2 +- benchmark/CMakeLists.txt | 13 ++++++++++ benchmark/bench_mul8.cu | 19 ++++++++++++++ include/header.cuh | 30 ++++++++++++++++++---- include/matrix.cuh | 23 ++++++++++++++--- include/multiplication.cuh | 51 ++++++++++++++++++++------------------ src/main.cu | 11 +++++--- 8 files changed, 114 insertions(+), 38 deletions(-) create mode 100644 benchmark/CMakeLists.txt create mode 100644 benchmark/bench_mul8.cu diff --git a/.gitignore b/.gitignore index d163863..01f9cb9 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -build/ \ No newline at end of file +build/ +.vscode/ \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 6d3877f..bd6657c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,4 +24,4 @@ target_link_libraries(cuelim OpenMP::OpenMP_CXX) # 链接 OpenMP 库 enable_testing() # 启动ctest测试 add_subdirectory(test) # 添加测试目录 -# add_subdirectory(benchmark) # 添加性能测试目录 \ No newline at end of file +add_subdirectory(benchmark) # 添加性能测试目录 \ No newline at end of file diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt new file mode 100644 index 0000000..3e15593 --- /dev/null +++ b/benchmark/CMakeLists.txt @@ -0,0 +1,13 @@ +find_package(benchmark REQUIRED) + +include_directories(${PROJECT_SOURCE_DIR}/test) + +set(BENCH_SRC_FILES + "bench_mul8.cu" +) + +foreach(SRC ${BENCH_SRC_FILES}) + get_filename_component(SRC_NAME ${SRC} NAME_WE) + add_executable(${SRC_NAME} ${SRC}) + target_link_libraries(${SRC_NAME} benchmark::benchmark benchmark::benchmark_main) +endforeach() \ No newline at end of file diff --git a/benchmark/bench_mul8.cu b/benchmark/bench_mul8.cu new file mode 100644 index 0000000..5e9e65c --- /dev/null +++ b/benchmark/bench_mul8.cu @@ -0,0 +1,19 @@ +#include +#include "cuelim.cuh" + +template +void bench_mul8(benchmark::State &state) +{ + GF28 ff(0b100011101); + uint_fast32_t seed = 41921095; + size_t x = state.range(0), y = state.range(1), z = state.range(2); + GF28Matrix A(x, y), B(y, z); + A.randomize(seed); + B.randomize(seed); + for (auto _ : state) + { + GF28Matrix C(GpuFunc(A, B, ff)); + } +} + +BENCHMARK_TEMPLATE(bench_mul8, gpu_mul)->Args({100000, 100000, 100000}); \ No newline at end of file diff --git a/include/header.cuh b/include/header.cuh index 68c02db..4863530 100755 --- a/include/header.cuh +++ b/include/header.cuh @@ -3,15 +3,15 @@ #include #include -#include +// #include // matrix -#include // gf28 +#include // gf28 +#include // matrix // #include // #include // #include // #include -#include // matrix // #include using namespace std; @@ -45,7 +45,7 @@ static const size_t THREAD_Y = base_deg; // 行 __constant__ gf28_t d_mul_table[1 << base_deg][1 << base_deg]; -__device__ base_t *at_pitch(base_t *base, size_t pitch, size_t r, size_t w) +__host__ __device__ base_t *at_pitch(base_t *base, size_t pitch, size_t r, size_t w) { return base + r * pitch + w; } @@ -71,8 +71,12 @@ __host__ inline void del8(base_t &dst, size_t idx) dst &= ~base_deg_mask[idx]; } -__device__ base_t mul_base(const gf28_t val, const base_t base, const size_t offset = 0) +__device__ inline base_t mul_base(const gf28_t val, const base_t base, const size_t offset = 0) { + if (val == 0) + { + return base_zero; + } base_t temp = base_zero; for (size_t i = offset; i < base_num; i++) { @@ -81,6 +85,22 @@ __device__ base_t mul_base(const gf28_t val, const base_t base, const size_t off return temp; } +__global__ void gpu_mktb_kernel(base_t *r_tb, size_t tb_pitch, base_t *src, size_t s_pitch, size_t width, size_t nrows) +{ + size_t w = blockIdx.x * blockDim.x + threadIdx.x; + size_t r = blockIdx.y * blockDim.y + threadIdx.y; + + if (w >= width || r >= nrows) + { + return; + } + + gf28_t val = get8(r, 0); + base_t s = *at_pitch(src, s_pitch, get8(r, 1), w); + base_t d = mul_base(val, s); + *at_pitch(r_tb, tb_pitch, r, w) = d; +} + __host__ inline base_t rev8(base_t n) { n = (n & 0xff00ff00ff00ff00ul) >> 8 | (n & 0x00ff00ff00ff00fful) << 8; diff --git a/include/matrix.cuh b/include/matrix.cuh index a464f35..21a5cef 100755 --- a/include/matrix.cuh +++ b/include/matrix.cuh @@ -16,14 +16,14 @@ public: GF28Matrix(size_t nrows, size_t ncols) : nrows(nrows), ncols(ncols), type(root) { width = (ncols - 1) / base_num + 1; - pitch = ((width - 1) / 4) * 4 + 1; // 以32字节(4*64bit)为单位对齐 + pitch = ((width - 1) / 4 + 1) * 4; // 以32字节(4*64bit)为单位对齐 CUDA_CHECK(cudaMallocManaged((void **)&data, nrows * pitch * sizeof(base_t))); CUDA_CHECK(cudaMemset(data, 0, nrows * pitch * sizeof(base_t))); } // 只能拷贝构造root矩阵 GF28Matrix(const GF28Matrix &m) : GF28Matrix(m.nrows, m.ncols) { - cudaMemcpy2D(data, pitch * sizeof(base_t), m.data, m.pitch * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault); + CUDA_CHECK(cudaMemcpy2D(data, pitch * sizeof(base_t), m.data, m.pitch * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault)); } GF28Matrix(GF28Matrix &&m) noexcept : nrows(m.nrows), ncols(m.ncols), width(m.width), pitch(m.pitch), type(m.type), data(m.data) { @@ -41,7 +41,7 @@ public: return *this; } assert(nrows == m.nrows && ncols == m.ncols); - cudaMemcpy2D(data, pitch * sizeof(base_t), m.data, m.pitch * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault); + CUDA_CHECK(cudaMemcpy2D(data, pitch * sizeof(base_t), m.data, m.pitch * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault)); return *this; } GF28Matrix &operator=(GF28Matrix &&m) noexcept @@ -111,6 +111,22 @@ public: } } + // void write(string path) const + // { + // assert(type == root); + // ofstream out(path, ios::binary); + // out.write((char *)data, nrows * pitch * sizeof(base_t)); + // out.close(); + // } + + // void read(string path) + // { + // assert(type == root); + // ifstream in(path, ios::binary); + // in.read((char *)data, nrows * pitch * sizeof(base_t)); + // in.close(); + // } + bool operator==(const GF28Matrix &m) const { if (nrows != m.nrows || ncols != m.ncols) @@ -164,6 +180,7 @@ public: friend ostream &operator<<(ostream &out, const GF28Matrix &m); void gpu_addmul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf); + friend GF28Matrix gpu_mul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf); // size_t nrows, ncols; // size_t width, pitch; diff --git a/include/multiplication.cuh b/include/multiplication.cuh index ee9d20b..211ca62 100644 --- a/include/multiplication.cuh +++ b/include/multiplication.cuh @@ -4,49 +4,52 @@ #include "matrix.cuh" #include "gf28.cuh" -// 处理32base列的所有行 -__global__ void gpu_addmul_kernel(base_t *a, size_t a_pitch, base_t *b, size_t b_pitch, base_t *c, size_t c_pitch, size_t nrows, size_t width) +__global__ void gpu_addmul_kernel(base_t *a, size_t a_pitch, base_t *tb, size_t tb_pitch, base_t *c, size_t c_pitch, size_t tb_num, size_t width, size_t nrows) { - __shared__ __align__(8) base_t src[base_deg][THREAD_X]; - size_t r = threadIdx.y; size_t w = blockIdx.x * blockDim.x + threadIdx.x; + size_t r = blockIdx.y * blockDim.y + threadIdx.y; - if (w >= width) + if (w >= width || r >= nrows) { return; } - if (r < nrows && w < width) - src[threadIdx.y][threadIdx.x] = *at_pitch(b, b_pitch, r, w); - else - src[threadIdx.y][threadIdx.x] = base_zero; - - __syncthreads(); - - for (; r < nrows; r += base_deg) + base_t val = *at_pitch(a, a_pitch, r, 0); + base_t temp = base_zero; + for (size_t i = 0; i < tb_num; i++) { - base_t val = *at_pitch(a, a_pitch, r, 0); - base_t temp = base_zero; - for (size_t i = 0; i < base_deg; i++) - { - temp ^= mul_base(get8(val, i), src[i][threadIdx.x]); - } - *at_pitch(c, c_pitch, r, w) ^= temp; + temp ^= *at_pitch(tb, tb_pitch, i * (1 << base_deg) + get8(val, i), w); } + *at_pitch(c, c_pitch, r, w) ^= temp; } __host__ void GF28Matrix::gpu_addmul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf) { assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols); cudaMemcpyToSymbol(d_mul_table, gf.mul_table, (1 << base_deg) * (1 << base_deg) * sizeof(gf28_t)); + GF28Matrix tb(base_num * (1 << base_deg), b.ncols); for (size_t w = 0; w < a.width; w++) { - dim3 block(THREAD_X, THREAD_Y); - dim3 grid((b.width - 1) / block.x + 1); + size_t tb_num = min(base_num, a.ncols - w * base_num); + + dim3 block_tb(THREAD_X, THREAD_Y); + dim3 grid_tb((b.width - 1) / block_tb.x + 1, (tb.nrows - 1) / block_tb.y + 1); + gpu_mktb_kernel<<>>(tb.data, tb.pitch, b.at_base(w * base_num, 0), b.pitch, tb.width, tb_num * (1 << base_deg)); + cudaDeviceSynchronize(); + + dim3 block(THREAD_X, THREAD_Y); + dim3 grid((b.width - 1) / block.x + 1, (nrows - 1) / block.y + 1); + gpu_addmul_kernel<<>>(a.at_base(0, w), a.pitch, tb.data, tb.pitch, data, pitch, tb_num, width, nrows); cudaDeviceSynchronize(); - gpu_addmul_kernel<<>>(a.at_base(0, w), a.pitch, b.at_base(w * base_num, 0), b.pitch, data, pitch, nrows, width); } - cudaDeviceSynchronize(); +} + +__host__ GF28Matrix gpu_mul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf) +{ + assert(a.ncols == b.nrows); + GF28Matrix c(a.nrows, b.ncols); + c.gpu_addmul(a, b, gf); + return c; } #endif diff --git a/src/main.cu b/src/main.cu index 69827c2..06d1dc5 100644 --- a/src/main.cu +++ b/src/main.cu @@ -2,10 +2,13 @@ int main() { - GF28Matrix a(10000, 10000); + GF28Matrix a(10, 10); a.randomize(123); - GF28Matrix b(10000, 10000); - b.randomize(456); - GF28Matrix c(10000, 10000); + GF28Matrix b(10, 10); + b.randomize(123); + GF28Matrix c(10, 10); c.gpu_addmul(a, b, GF28(0b100011101)); + cout << a << endl; + cout << b << endl; + cout << c << endl; } \ No newline at end of file