Complete multiplication
parent 06942fbc8e
commit a1c14efec3
3  .gitignore (vendored)
@@ -1 +1,2 @@
 build/
+.vscode/
@@ -24,4 +24,4 @@ target_link_libraries(cuelim OpenMP::OpenMP_CXX) # link the OpenMP library
 enable_testing() # enable ctest
 add_subdirectory(test) # add the test directory

-# add_subdirectory(benchmark) # add the benchmark directory
+add_subdirectory(benchmark) # add the benchmark directory
13  benchmark/CMakeLists.txt (new file)
@@ -0,0 +1,13 @@
+find_package(benchmark REQUIRED)
+
+include_directories(${PROJECT_SOURCE_DIR}/test)
+
+set(BENCH_SRC_FILES
+    "bench_mul8.cu"
+)
+
+foreach(SRC ${BENCH_SRC_FILES})
+    get_filename_component(SRC_NAME ${SRC} NAME_WE)
+    add_executable(${SRC_NAME} ${SRC})
+    target_link_libraries(${SRC_NAME} benchmark::benchmark benchmark::benchmark_main)
+endforeach()
19  benchmark/bench_mul8.cu (new file)
@@ -0,0 +1,19 @@
+#include <benchmark/benchmark.h>
+#include "cuelim.cuh"
+
+template <GF28Matrix (*GpuFunc)(const GF28Matrix &, const GF28Matrix &, const GF28 &)>
+void bench_mul8(benchmark::State &state)
+{
+    GF28 ff(0b100011101);
+    uint_fast32_t seed = 41921095;
+    size_t x = state.range(0), y = state.range(1), z = state.range(2);
+    GF28Matrix A(x, y), B(y, z);
+    A.randomize(seed);
+    B.randomize(seed);
+    for (auto _ : state)
+    {
+        GF28Matrix C(GpuFunc(A, B, ff));
+    }
+}
+
+BENCHMARK_TEMPLATE(bench_mul8, gpu_mul)->Args({100000, 100000, 100000});
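Here benchmark::benchmark_main supplies main(), and each ->Args({x, y, z}) triple is read back through state.range(0..2) as the two outer dimensions and the shared inner dimension. Note that the registered 100000^3 case needs on the order of 10 GB of managed memory per packed operand, so smaller shapes are useful on most GPUs; a minimal sketch of extra registrations (these sizes are illustrative, not from the commit):

// Hypothetical additional shapes for the same benchmark template:
BENCHMARK_TEMPLATE(bench_mul8, gpu_mul)->Args({1024, 1024, 1024});
BENCHMARK_TEMPLATE(bench_mul8, gpu_mul)->Args({8192, 8192, 8192});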
@@ -3,15 +3,15 @@
 #include <iostream>
 #include <cassert>
 #include <cuda_runtime.h>
 // #include <fstream> // matrix

 #include <set> // gf28
+#include <random> // matrix
 // #include <map>
 // #include <vector>

 // #include <algorithm>
 // #include <numeric>
-#include <random> // matrix
 // #include <omp.h>

 using namespace std;
@@ -45,7 +45,7 @@ static const size_t THREAD_Y = base_deg; // rows

 __constant__ gf28_t d_mul_table[1 << base_deg][1 << base_deg];

-__device__ base_t *at_pitch(base_t *base, size_t pitch, size_t r, size_t w)
+__host__ __device__ base_t *at_pitch(base_t *base, size_t pitch, size_t r, size_t w)
 {
     return base + r * pitch + w;
 }
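The change only adds __host__, so the same pitched indexing (row r, word w at flat offset r * pitch + w, in base_t words) is now callable from host code as well. A minimal host-side check (hypothetical buffer and pitch, not from the commit):

// 4 rows with a pitch of 8 base_t words per row: element (2, 3) sits at offset 19.
base_t buf[4 * 8];
assert(at_pitch(buf, 8, 2, 3) == buf + 2 * 8 + 3);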
@@ -71,8 +71,12 @@ __host__ inline void del8(base_t &dst, size_t idx)
     dst &= ~base_deg_mask[idx];
 }

-__device__ base_t mul_base(const gf28_t val, const base_t base, const size_t offset = 0)
+__device__ inline base_t mul_base(const gf28_t val, const base_t base, const size_t offset = 0)
 {
     if (val == 0)
     {
         return base_zero;
     }
     base_t temp = base_zero;
     for (size_t i = offset; i < base_num; i++)
     {
@@ -81,6 +85,22 @@ __device__ base_t mul_base(const gf28_t val, const base_t base, const size_t off
     return temp;
 }

+__global__ void gpu_mktb_kernel(base_t *r_tb, size_t tb_pitch, base_t *src, size_t s_pitch, size_t width, size_t nrows)
+{
+    size_t w = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t r = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (w >= width || r >= nrows)
+    {
+        return;
+    }
+
+    gf28_t val = get8(r, 0);
+    base_t s = *at_pitch(src, s_pitch, get8(r, 1), w);
+    base_t d = mul_base(val, s);
+    *at_pitch(r_tb, tb_pitch, r, w) = d;
+}
+
 __host__ inline base_t rev8(base_t n)
 {
     n = (n & 0xff00ff00ff00ff00ul) >> 8 | (n & 0x00ff00ff00ff00fful) << 8;
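gpu_mktb_kernel builds the per-block product table: table row r packs the multiplier byte in get8(r, 0) and the B-block row index in get8(r, 1), so row i * 256 + v receives v times the i-th packed row. The byte products themselves come from d_mul_table over GF(2^8) with the reduction polynomial 0b100011101 used throughout this commit. For grounding, a host-side reference for a single product under that polynomial (my own sketch, not repo code):

#include <cstdint>

// Reference product in GF(2^8) modulo x^8 + x^4 + x^3 + x^2 + 1 (0b100011101).
static uint8_t gf28_mul_ref(uint8_t a, uint8_t b)
{
    uint16_t acc = 0;
    for (int i = 0; i < 8; i++)   // carry-less multiply: XOR shifted copies of a
        if (b & (1u << i))
            acc ^= (uint16_t)(a) << i;
    for (int i = 15; i >= 8; i--) // reduce terms of degree 8..15 by the polynomial
        if (acc & (1u << i))
            acc ^= 0x11Du << (i - 8);
    return (uint8_t)acc;
}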
@@ -16,14 +16,14 @@ public:
     GF28Matrix(size_t nrows, size_t ncols) : nrows(nrows), ncols(ncols), type(root)
     {
         width = (ncols - 1) / base_num + 1;
-        pitch = ((width - 1) / 4) * 4 + 1; // align to 32 bytes (4 * 64-bit words)
+        pitch = ((width - 1) / 4 + 1) * 4; // align to 32 bytes (4 * 64-bit words)
         CUDA_CHECK(cudaMallocManaged((void **)&data, nrows * pitch * sizeof(base_t)));
         CUDA_CHECK(cudaMemset(data, 0, nrows * pitch * sizeof(base_t)));
     }
     // only root matrices can be copy-constructed
     GF28Matrix(const GF28Matrix &m) : GF28Matrix(m.nrows, m.ncols)
     {
-        cudaMemcpy2D(data, pitch * sizeof(base_t), m.data, m.pitch * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault);
+        CUDA_CHECK(cudaMemcpy2D(data, pitch * sizeof(base_t), m.data, m.pitch * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault));
     }
     GF28Matrix(GF28Matrix &&m) noexcept : nrows(m.nrows), ncols(m.ncols), width(m.width), pitch(m.pitch), type(m.type), data(m.data)
     {
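The pitch fix matters: the old expression ((width - 1) / 4) * 4 + 1 yields 1 word for width = 4 (too small) and 5 words for width = 5 (not a multiple of 4), while the new form rounds up to the next multiple of 4 words. A standalone check of the corrected arithmetic (my sketch):

#include <cassert>
#include <cstddef>

// Round width up to the next multiple of 4 base_t words (32-byte alignment).
static size_t aligned_pitch(size_t width)
{
    return ((width - 1) / 4 + 1) * 4;
}

int main()
{
    assert(aligned_pitch(1) == 4); // old formula gave 1
    assert(aligned_pitch(4) == 4); // old formula gave 1
    assert(aligned_pitch(5) == 8); // old formula gave 5
}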
@@ -41,7 +41,7 @@ public:
             return *this;
         }
         assert(nrows == m.nrows && ncols == m.ncols);
-        cudaMemcpy2D(data, pitch * sizeof(base_t), m.data, m.pitch * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault);
+        CUDA_CHECK(cudaMemcpy2D(data, pitch * sizeof(base_t), m.data, m.pitch * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault));
         return *this;
     }
     GF28Matrix &operator=(GF28Matrix &&m) noexcept
@@ -111,6 +111,22 @@ public:
         }
     }

+    // void write(string path) const
+    // {
+    //     assert(type == root);
+    //     ofstream out(path, ios::binary);
+    //     out.write((char *)data, nrows * pitch * sizeof(base_t));
+    //     out.close();
+    // }
+
+    // void read(string path)
+    // {
+    //     assert(type == root);
+    //     ifstream in(path, ios::binary);
+    //     in.read((char *)data, nrows * pitch * sizeof(base_t));
+    //     in.close();
+    // }
+
     bool operator==(const GF28Matrix &m) const
     {
         if (nrows != m.nrows || ncols != m.ncols)
@@ -164,6 +180,7 @@ public:

     friend ostream &operator<<(ostream &out, const GF28Matrix &m);
     void gpu_addmul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf);
+    friend GF28Matrix gpu_mul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf);

     // size_t nrows, ncols;
     // size_t width, pitch;
@@ -4,49 +4,52 @@
 #include "matrix.cuh"
 #include "gf28.cuh"

-// handle all rows of a 32-base-column block
-__global__ void gpu_addmul_kernel(base_t *a, size_t a_pitch, base_t *b, size_t b_pitch, base_t *c, size_t c_pitch, size_t nrows, size_t width)
+__global__ void gpu_addmul_kernel(base_t *a, size_t a_pitch, base_t *tb, size_t tb_pitch, base_t *c, size_t c_pitch, size_t tb_num, size_t width, size_t nrows)
 {
-    __shared__ __align__(8) base_t src[base_deg][THREAD_X];
-    size_t r = threadIdx.y;
     size_t w = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t r = blockIdx.y * blockDim.y + threadIdx.y;

-    if (w >= width)
+    if (w >= width || r >= nrows)
     {
         return;
     }

-    if (r < nrows && w < width)
-        src[threadIdx.y][threadIdx.x] = *at_pitch(b, b_pitch, r, w);
-    else
-        src[threadIdx.y][threadIdx.x] = base_zero;
-
-    __syncthreads();
-
-    for (; r < nrows; r += base_deg)
+    base_t val = *at_pitch(a, a_pitch, r, 0);
+    base_t temp = base_zero;
+    for (size_t i = 0; i < tb_num; i++)
     {
-        base_t val = *at_pitch(a, a_pitch, r, 0);
-        base_t temp = base_zero;
-        for (size_t i = 0; i < base_deg; i++)
-        {
-            temp ^= mul_base(get8(val, i), src[i][threadIdx.x]);
-        }
-        *at_pitch(c, c_pitch, r, w) ^= temp;
+        temp ^= *at_pitch(tb, tb_pitch, i * (1 << base_deg) + get8(val, i), w);
     }
+    *at_pitch(c, c_pitch, r, w) ^= temp;
 }

 __host__ void GF28Matrix::gpu_addmul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf)
 {
     assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols);
     cudaMemcpyToSymbol(d_mul_table, gf.mul_table, (1 << base_deg) * (1 << base_deg) * sizeof(gf28_t));
+    GF28Matrix tb(base_num * (1 << base_deg), b.ncols);
     for (size_t w = 0; w < a.width; w++)
     {
-        dim3 block(THREAD_X, THREAD_Y);
-        dim3 grid((b.width - 1) / block.x + 1);
+        size_t tb_num = min(base_num, a.ncols - w * base_num);
+
+        dim3 block_tb(THREAD_X, THREAD_Y);
+        dim3 grid_tb((b.width - 1) / block_tb.x + 1, (tb.nrows - 1) / block_tb.y + 1);
+        gpu_mktb_kernel<<<grid_tb, block_tb>>>(tb.data, tb.pitch, b.at_base(w * base_num, 0), b.pitch, tb.width, tb_num * (1 << base_deg));
+        cudaDeviceSynchronize();

-        gpu_addmul_kernel<<<grid, block>>>(a.at_base(0, w), a.pitch, b.at_base(w * base_num, 0), b.pitch, data, pitch, nrows, width);
+        dim3 block(THREAD_X, THREAD_Y);
+        dim3 grid((b.width - 1) / block.x + 1, (nrows - 1) / block.y + 1);
+        gpu_addmul_kernel<<<grid, block>>>(a.at_base(0, w), a.pitch, tb.data, tb.pitch, data, pitch, tb_num, width, nrows);
+        cudaDeviceSynchronize();
     }
     cudaDeviceSynchronize();
 }

 __host__ GF28Matrix gpu_mul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf)
 {
     assert(a.ncols == b.nrows);
     GF28Matrix c(a.nrows, b.ncols);
     c.gpu_addmul(a, b, gf);
     return c;
 }

 #endif
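In short, the rewrite trades the shared-memory mul_base inner loop for table lookups: once per 8-byte coefficient block of A, gpu_mktb_kernel materializes every multiple v of every packed row of the current B block, and gpu_addmul_kernel then XOR-accumulates tb_num lookups per output word, indexing row i * 256 + get8(val, i). Each 256-row table is built once but reused by every output row. A minimal CPU sketch of the same idea (my own code; hypothetical names and a byte-per-element layout, not the packed base_t layout the kernels use):

#include <cstddef>
#include <cstdint>
#include <vector>

using Mat = std::vector<std::vector<uint8_t>>;

// mul_table[a][b] is the GF(2^8) product, mirroring gf.mul_table / d_mul_table.
Mat mul_with_tables(const Mat &A, const Mat &B, const uint8_t mul_table[256][256])
{
    size_t n = A.size(), k = B.size(), m = B[0].size();
    Mat C(n, std::vector<uint8_t>(m, 0));
    for (size_t kk = 0; kk < k; kk++)
    {
        // Build tb[v] = v * (row kk of B) once; every output row reuses it.
        Mat tb(256, std::vector<uint8_t>(m));
        for (size_t v = 0; v < 256; v++)
            for (size_t j = 0; j < m; j++)
                tb[v][j] = mul_table[v][B[kk][j]];
        // GF(2^8) addition is XOR, so accumulation is a row-wise XOR of lookups.
        for (size_t i = 0; i < n; i++)
            for (size_t j = 0; j < m; j++)
                C[i][j] ^= tb[A[i][kk]][j];
    }
    return C;
}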
11  src/main.cu
@@ -2,10 +2,13 @@

 int main()
 {
-    GF28Matrix a(10000, 10000);
+    GF28Matrix a(10, 10);
     a.randomize(123);
-    GF28Matrix b(10000, 10000);
-    b.randomize(456);
-    GF28Matrix c(10000, 10000);
+    GF28Matrix b(10, 10);
+    b.randomize(123);
+    GF28Matrix c(10, 10);
     c.gpu_addmul(a, b, GF28(0b100011101));
+    cout << a << endl;
+    cout << b << endl;
+    cout << c << endl;
 }
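Because c is zero-initialized by its constructor and gpu_addmul accumulates into it, c ends up holding the plain product; the new gpu_mul friend wraps exactly that. A hypothetical equivalence check using only functions from this commit:

GF28Matrix c2 = gpu_mul(a, b, GF28(0b100011101)); // same product via the wrapper
assert(c == c2);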