diff --git a/CMakeLists.txt b/CMakeLists.txt index bd6657c..76a31ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ set(CMAKE_CUDA_ARCHITECTURES native) # 设置CUDA架构 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") -project(cuElim_GF28 LANGUAGES CXX CUDA) # 设置项目名和语言 +project(cuElim_GF256 LANGUAGES CXX CUDA) # 设置项目名和语言 include_directories(${PROJECT_SOURCE_DIR}/include) # 添加头文件目录 diff --git a/benchmark/bench_mul8.cu b/benchmark/bench_mul8.cu index 5e9e65c..0b207ca 100644 --- a/benchmark/bench_mul8.cu +++ b/benchmark/bench_mul8.cu @@ -1,18 +1,18 @@ #include #include "cuelim.cuh" -template +template void bench_mul8(benchmark::State &state) { - GF28 ff(0b100011101); + GF256 ff(0b100011101); uint_fast32_t seed = 41921095; size_t x = state.range(0), y = state.range(1), z = state.range(2); - GF28Matrix A(x, y), B(y, z); + MatGF256 A(x, y), B(y, z); A.randomize(seed); B.randomize(seed); for (auto _ : state) { - GF28Matrix C(GpuFunc(A, B, ff)); + MatGF256 C(GpuFunc(A, B, ff)); } } diff --git a/include/cuelim.cuh b/include/cuelim.cuh index 4441db4..bd1ba52 100644 --- a/include/cuelim.cuh +++ b/include/cuelim.cuh @@ -1,6 +1,7 @@ #ifndef CUELIM_CUH #define CUELIM_CUH -#include "multiplication.cuh" +#include "gf256/gf256_mul.cuh" +#include "gf256/gf256_elim.cuh" #endif \ No newline at end of file diff --git a/include/elimination.cuh b/include/elimination.cuh deleted file mode 100644 index e69de29..0000000 diff --git a/include/gf256/gf256_elim.cuh b/include/gf256/gf256_elim.cuh new file mode 100644 index 0000000..d32a0c0 --- /dev/null +++ b/include/gf256/gf256_elim.cuh @@ -0,0 +1,11 @@ +#ifndef ELIMINATION_CUH +#define ELIMINATION_CUH + +#include "gf256_mat.cuh" + +struct ElimResult +{ + size_t rank; +}; + +#endif \ No newline at end of file diff --git a/include/gf256/gf256_header.cuh b/include/gf256/gf256_header.cuh new file mode 100755 index 0000000..8de67b5 --- /dev/null +++ b/include/gf256/gf256_header.cuh @@ -0,0 +1,162 @@ +#ifndef GF256_CUH +#define GF256_CUH + +#include "../header.cuh" +#include + +using gf256_t = uint8_t; + +static const size_t gf256_len = sizeof(gf256_t) * 8; +static const size_t gf256_num = base_len / gf256_len; + +static const gf256_t gf256_zero = (gf256_t)0x00; +static const gf256_t gf256_one = (gf256_t)0x01; + +static const base_t gf256_mask[8] = { + (base_t)0x00'00'00'00'00'00'00'FF, + (base_t)0x00'00'00'00'00'00'FF'00, + (base_t)0x00'00'00'00'00'FF'00'00, + (base_t)0x00'00'00'00'FF'00'00'00, + (base_t)0x00'00'00'FF'00'00'00'00, + (base_t)0x00'00'FF'00'00'00'00'00, + (base_t)0x00'FF'00'00'00'00'00'00, + (base_t)0xFF'00'00'00'00'00'00'00}; + +__host__ __device__ inline size_t offset8(size_t idx) +{ + return idx << 3; +} + +__host__ __device__ inline gf256_t get8(base_t src, size_t idx) +{ + return (gf256_t)(src >> offset8(idx)); +} + +// 确保set8对应位置的值为0 +__host__ __device__ inline void set8(base_t &dst, size_t idx, gf256_t src) +{ + dst |= (base_t)src << offset8(idx); +} + +__host__ inline void del8(base_t &dst, size_t idx) +{ + dst &= ~gf256_mask[idx]; +} + +__host__ inline base_t rev8(base_t n) +{ + n = (n & 0xff00ff00ff00ff00ul) >> 8 | (n & 0x00ff00ff00ff00fful) << 8; + n = (n & 0xffff0000ffff0000ul) >> 16 | (n & 0x0000ffff0000fffful) << 16; + return n >> 32 | n << 32; +} + +__constant__ gf256_t d_mul_table[1 << gf256_len][1 << gf256_len]; + +__device__ inline base_t mul_base(const gf256_t val, const base_t base) +{ + if (val == 0) + { + return base_zero; + } + base_t temp = base_zero; + for (size_t i = 0; i < gf256_len; i++) + { + set8(temp, i, d_mul_table[val][get8(base, i)]); + } + return temp; +} + +static const set irreducible_polynomials_degree_08{0x11b, 0x11d, 0x12b, 0x12d, 0x139, 0x13f, 0x14d, 0x15f, 0x163, 0x165, 0x169, 0x171, 0x177, 0x17b, 0x187, 0x18b, 0x18d, 0x19f, 0x1a3, 0x1a9, 0x1b1, 0x1bd, 0x1c3, 0x1cf, 0x1d7, 0x1dd, 0x1e7, 0x1f3, 0x1f5, 0x1f9}; + +class GF256 +{ +public: + GF256(base_t poly) + { + assert(irreducible_polynomials_degree_08.count(poly) == 1); + this->poly = poly; + for (size_t x = 0; x < (1 << gf256_len); x++) + { + mul_table[x][gf256_zero] = gf256_zero; + for (size_t d = 0; d < gf256_len; d++) + { + gf256_t val = shift_left(x, d); + for (size_t y = (1 << d); y < (1 << (d + 1)); y++) + { + mul_table[x][y] = val ^ mul_table[x][y ^ (1 << d)]; + if (mul_table[x][y] == gf256_one) + { + inv_table[x] = y; + } + } + } + } + inv_table[gf256_zero] = gf256_zero; + } + + gf256_t mul(const gf256_t x, const gf256_t y) const + { + return mul_table[x][y]; + } + + base_t mul_base(const gf256_t val, const base_t base, const size_t offset = 0) const + { + base_t temp = base_zero; + for (size_t i = offset; i < gf256_num; i++) + { + set8(temp, mul(val, get8(base, i)), i); + } + return temp; + } + + gf256_t inv(gf256_t x) + { + return inv_table[x]; + } + + inline cudaError_t cpy_to_constant() const + { + return cudaMemcpyToSymbol(d_mul_table, mul_table, (1 << gf256_len) * (1 << gf256_len) * sizeof(gf256_t)); + } + + friend ostream &operator<<(ostream &out, const GF256 &gf); + + GF256() = delete; + GF256(const GF256 &) = delete; + GF256(GF256 &&) = delete; + GF256 &operator=(const GF256 &) = delete; + GF256 &operator=(GF256 &&) = delete; + +private: + gf256_t shift_left(gf256_t x, size_t d) + { + base_t temp = (base_t)x << d; + for (size_t i = gf256_len - 1 + d; i > gf256_len - 1; i--) + { + if (temp & (1 << i)) + { + temp ^= poly << (i - gf256_len); + } + } + return temp; + } + + base_t poly; + gf256_t inv_table[1 << gf256_num]; + gf256_t mul_table[1 << gf256_num][1 << gf256_num]; +}; + +ostream &operator<<(ostream &out, const GF256 &gf) +{ + for (size_t x = 0; x < 1 << gf256_len; x++) + { + for (size_t y = 0; y < 1 << gf256_len; y++) + { + printf("%02X ", gf.mul_table[x][y]); + } + printf("\n"); + } + return out; +} + +#endif \ No newline at end of file diff --git a/include/matrix.cuh b/include/gf256/gf256_mat.cuh similarity index 55% rename from include/matrix.cuh rename to include/gf256/gf256_mat.cuh index 21a5cef..3a61ac3 100755 --- a/include/matrix.cuh +++ b/include/gf256/gf256_mat.cuh @@ -1,10 +1,11 @@ -#ifndef MATRIX_CUH -#define MATRIX_CUH +#ifndef MATGF256_CUH +#define MATGF256_CUH -#include "header.cuh" -#include "gf28.cuh" +#include "gf256_header.cuh" -class GF28Matrix +#include + +class MatGF256 { public: enum MatType @@ -13,38 +14,38 @@ public: view }; // 只能构造root矩阵 - GF28Matrix(size_t nrows, size_t ncols) : nrows(nrows), ncols(ncols), type(root) + MatGF256(size_t nrows, size_t ncols) : nrows(nrows), ncols(ncols), type(root) { - width = (ncols - 1) / base_num + 1; - pitch = ((width - 1) / 4 + 1) * 4; // 以32字节(4*64bit)为单位对齐 - CUDA_CHECK(cudaMallocManaged((void **)&data, nrows * pitch * sizeof(base_t))); - CUDA_CHECK(cudaMemset(data, 0, nrows * pitch * sizeof(base_t))); + width = (ncols - 1) / gf256_num + 1; + rowstride = ((width - 1) / 4 + 1) * 4; // 以32字节(4*64bit)为单位对齐 + CUDA_CHECK(cudaMallocManaged((void **)&data, nrows * rowstride * sizeof(base_t))); + CUDA_CHECK(cudaMemset(data, 0, nrows * rowstride * sizeof(base_t))); } // 只能拷贝构造root矩阵 - GF28Matrix(const GF28Matrix &m) : GF28Matrix(m.nrows, m.ncols) + MatGF256(const MatGF256 &m) : MatGF256(m.nrows, m.ncols) { - CUDA_CHECK(cudaMemcpy2D(data, pitch * sizeof(base_t), m.data, m.pitch * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy2D(data, rowstride * sizeof(base_t), m.data, m.rowstride * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault)); } - GF28Matrix(GF28Matrix &&m) noexcept : nrows(m.nrows), ncols(m.ncols), width(m.width), pitch(m.pitch), type(m.type), data(m.data) + MatGF256(MatGF256 &&m) noexcept : nrows(m.nrows), ncols(m.ncols), width(m.width), rowstride(m.rowstride), type(m.type), data(m.data) { m.nrows = 0; m.ncols = 0; m.width = 0; - m.pitch = 0; + m.rowstride = 0; m.type = view; m.data = nullptr; } - GF28Matrix &operator=(const GF28Matrix &m) + MatGF256 &operator=(const MatGF256 &m) { if (this == &m) { return *this; } assert(nrows == m.nrows && ncols == m.ncols); - CUDA_CHECK(cudaMemcpy2D(data, pitch * sizeof(base_t), m.data, m.pitch * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy2D(data, rowstride * sizeof(base_t), m.data, m.rowstride * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault)); return *this; } - GF28Matrix &operator=(GF28Matrix &&m) noexcept + MatGF256 &operator=(MatGF256 &&m) noexcept { if (this == &m) { @@ -57,19 +58,19 @@ public: nrows = m.nrows; ncols = m.ncols; width = m.width; - pitch = m.pitch; + rowstride = m.rowstride; type = m.type; data = m.data; m.nrows = 0; m.ncols = 0; m.width = 0; - m.pitch = 0; + m.rowstride = 0; m.type = view; m.data = nullptr; return *this; } - ~GF28Matrix() + ~MatGF256() { if (type == root) { @@ -79,18 +80,18 @@ public: inline base_t *at_base(size_t r, size_t w) const { - return data + r * pitch + w; + return data + r * rowstride + w; } // 只能以base_t为单位进行操作 - GF28Matrix createView(size_t begin_ri, size_t begin_wi, size_t end_rj, size_t end_wj) const + MatGF256 createView(size_t begin_ri, size_t begin_wi, size_t end_rj, size_t end_wj) const { assert(begin_ri < end_rj && end_rj <= nrows && begin_wi < end_wj && end_wj <= width); - GF28Matrix view; + MatGF256 view; view.nrows = end_rj - begin_ri; - view.ncols = (end_wj == width ? ncols : end_wj * base_num) - begin_wi * base_num; + view.ncols = (end_wj == width ? ncols : end_wj * gf256_num) - begin_wi * gf256_num; view.width = end_wj - begin_wi; - view.pitch = pitch; + view.rowstride = rowstride; view.data = at_base(begin_ri, begin_wi); return view; } @@ -100,7 +101,7 @@ public: assert(type == root); static default_random_engine e(seed); static uniform_int_distribution d; - base_t lastmask = base_fullmask >> (width * base_len - ncols * base_deg); + base_t lastmask = base_fullmask >> (width * base_len - ncols * gf256_len); for (size_t r = 0; r < nrows; r++) { for (size_t w = 0; w < width; w++) @@ -111,23 +112,7 @@ public: } } - // void write(string path) const - // { - // assert(type == root); - // ofstream out(path, ios::binary); - // out.write((char *)data, nrows * pitch * sizeof(base_t)); - // out.close(); - // } - - // void read(string path) - // { - // assert(type == root); - // ifstream in(path, ios::binary); - // in.read((char *)data, nrows * pitch * sizeof(base_t)); - // in.close(); - // } - - bool operator==(const GF28Matrix &m) const + bool operator==(const MatGF256 &m) const { if (nrows != m.nrows || ncols != m.ncols) { @@ -160,7 +145,7 @@ public: } return true; } - void operator^=(const GF28Matrix &m) + void operator^=(const MatGF256 &m) { assert(nrows == m.nrows && ncols == m.ncols); for (size_t r = 0; r < nrows; r++) @@ -171,29 +156,29 @@ public: } } } - GF28Matrix operator^(const GF28Matrix &m) const + MatGF256 operator^(const MatGF256 &m) const { - GF28Matrix temp(*this); + MatGF256 temp(*this); temp ^= m; return temp; } - friend ostream &operator<<(ostream &out, const GF28Matrix &m); - void gpu_addmul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf); - friend GF28Matrix gpu_mul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf); + friend ostream &operator<<(ostream &out, const MatGF256 &m); + void gpu_addmul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf); + friend MatGF256 gpu_mul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf); // size_t nrows, ncols; - // size_t width, pitch; + // size_t width, rowstride; private: - GF28Matrix() : nrows(0), ncols(0), width(0), pitch(0), type(view), data(nullptr) {} + MatGF256() : nrows(0), ncols(0), width(0), rowstride(0), type(view), data(nullptr) {} size_t nrows, ncols; - size_t width, pitch; + size_t width, rowstride; MatType type; base_t *data; }; -ostream &operator<<(ostream &out, const GF28Matrix &m) +ostream &operator<<(ostream &out, const MatGF256 &m) { for (size_t r = 0; r < m.nrows; r++) { diff --git a/include/gf256/gf256_mul.cuh b/include/gf256/gf256_mul.cuh new file mode 100644 index 0000000..db0aacc --- /dev/null +++ b/include/gf256/gf256_mul.cuh @@ -0,0 +1,70 @@ +#ifndef MULTIPLICATION_CUH +#define MULTIPLICATION_CUH + +#include "gf256_mat.cuh" + +__global__ void gpu_mktb_kernel(base_t *r_tb, size_t tb_rowstride, base_t *src, size_t s_rowstride, size_t width, size_t nrows) +{ + size_t w = blockIdx.x * blockDim.x + threadIdx.x; + size_t r = blockIdx.y * blockDim.y + threadIdx.y; + + if (w >= width || r >= nrows) + { + return; + } + + gf256_t val = get8(r, 0); + base_t s = *at_base(src, s_rowstride, get8(r, 1), w); + base_t d = mul_base(val, s); + *at_base(r_tb, tb_rowstride, r, w) = d; +} + +__global__ void gpu_addmul_kernel(base_t *a, size_t a_rowstride, base_t *tb, size_t tb_rowstride, base_t *c, size_t c_rowstride, size_t tb_num, size_t width, size_t nrows) +{ + size_t w = blockIdx.x * blockDim.x + threadIdx.x; + size_t r = blockIdx.y * blockDim.y + threadIdx.y; + + if (w >= width || r >= nrows) + { + return; + } + + base_t val = *at_base(a, a_rowstride, r, 0); + base_t temp = base_zero; + for (size_t i = 0; i < tb_num; i++) + { + temp ^= *at_base(tb, tb_rowstride, i * (1 << gf256_len) + get8(val, i), w); + } + *at_base(c, c_rowstride, r, w) ^= temp; +} + +__host__ void MatGF256::gpu_addmul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf) +{ + assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols); + gf.cpy_to_constant(); + MatGF256 tb(gf256_num * (1 << gf256_len), b.ncols); + for (size_t w = 0; w < a.width; w++) + { + size_t tb_num = min(gf256_num, a.ncols - w * gf256_num); + + dim3 block_tb(THREAD_X, THREAD_Y); + dim3 grid_tb((b.width - 1) / block_tb.x + 1, (tb.nrows - 1) / block_tb.y + 1); + gpu_mktb_kernel<<>>(tb.data, tb.rowstride, b.at_base(w * gf256_num, 0), b.rowstride, tb.width, tb_num * (1 << gf256_len)); + cudaDeviceSynchronize(); + + dim3 block(THREAD_X, THREAD_Y); + dim3 grid((b.width - 1) / block.x + 1, (nrows - 1) / block.y + 1); + gpu_addmul_kernel<<>>(a.at_base(0, w), a.rowstride, tb.data, tb.rowstride, data, rowstride, tb_num, width, nrows); + cudaDeviceSynchronize(); + } +} + +__host__ MatGF256 gpu_mul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf) +{ + assert(a.ncols == b.nrows); + MatGF256 c(a.nrows, b.ncols); + c.gpu_addmul(a, b, gf); + return c; +} + +#endif diff --git a/include/gf28.cuh b/include/gf28.cuh deleted file mode 100755 index 9aef34a..0000000 --- a/include/gf28.cuh +++ /dev/null @@ -1,95 +0,0 @@ -#ifndef GF28_CUH -#define GF28_CUH - -#include "header.cuh" - -static const set irreducible_polynomials_degree_08{0x11b, 0x11d, 0x12b, 0x12d, 0x139, 0x13f, 0x14d, 0x15f, 0x163, 0x165, 0x169, 0x171, 0x177, 0x17b, 0x187, 0x18b, 0x18d, 0x19f, 0x1a3, 0x1a9, 0x1b1, 0x1bd, 0x1c3, 0x1cf, 0x1d7, 0x1dd, 0x1e7, 0x1f3, 0x1f5, 0x1f9}; - -class GF28 -{ -public: - GF28(base_t poly) - { - assert(irreducible_polynomials_degree_08.count(poly) == 1); - this->poly = poly; - for (size_t x = 0; x < (1 << base_deg); x++) - { - mul_table[x][gf28_zero] = gf28_zero; - for (size_t d = 0; d < base_deg; d++) - { - gf28_t val = shift_left(x, d); - for (size_t y = (1 << d); y < (1 << (d + 1)); y++) - { - mul_table[x][y] = val ^ mul_table[x][y ^ (1 << d)]; - if (mul_table[x][y] == gf28_one) - { - inv_table[x] = y; - } - } - } - } - inv_table[gf28_zero] = gf28_zero; - } - - gf28_t mul(const gf28_t x, const gf28_t y) const - { - return mul_table[x][y]; - } - - base_t mul_base(const gf28_t val, const base_t base, const size_t offset = 0) const - { - base_t temp = base_zero; - for (size_t i = offset; i < base_num; i++) - { - set8(temp, mul(val, get8(base, i)), i); - } - return temp; - } - - gf28_t inv(gf28_t x) - { - return inv_table[x]; - } - - friend ostream &operator<<(ostream &out, const GF28 &gf); - - GF28() = delete; - GF28(const GF28 &) = delete; - GF28(GF28 &&) = delete; - GF28 &operator=(const GF28 &) = delete; - GF28 &operator=(GF28 &&) = delete; - - gf28_t mul_table[1 << base_deg][1 << base_deg]; - -private: - gf28_t shift_left(gf28_t x, size_t d) - { - base_t temp = (base_t)x << d; - for (size_t i = base_deg - 1 + d; i > base_deg - 1; i--) - { - if (temp & (1 << i)) - { - temp ^= poly << (i - base_deg); - } - } - return temp; - } - - base_t poly; - gf28_t inv_table[1 << base_deg]; -}; - -ostream &operator<<(ostream &out, const GF28 &gf) -{ - for (size_t x = 0; x < 1 << base_deg; x++) - { - for (size_t y = 0; y < 1 << base_deg; y++) - { - printf("%02X ", gf.mul_table[x][y]); - } - printf("\n"); - } - return out; -} - -#endif \ No newline at end of file diff --git a/include/header.cuh b/include/header.cuh index 4863530..c61eb77 100755 --- a/include/header.cuh +++ b/include/header.cuh @@ -3,10 +3,8 @@ #include #include -// #include // matrix -#include // gf28 -#include // matrix +// matrix // #include // #include @@ -17,95 +15,20 @@ using namespace std; using base_t = uint64_t; -using gf28_t = uint8_t; -static const size_t base_deg = 8; -static const size_t base_num = 8; -static const size_t base_len = 64; -static_assert(base_len == base_deg * base_num && base_len == sizeof(base_t) * 8); +static const size_t base_len = sizeof(base_t) * 8; static const base_t base_zero = (base_t)0x00'00'00'00'00'00'00'00; static const base_t base_one = (base_t)0x00'00'00'00'00'00'00'01; -static const gf28_t gf28_zero = (gf28_t)0x00; -static const gf28_t gf28_one = (gf28_t)0x01; static const base_t base_fullmask = (base_t)0xFF'FF'FF'FF'FF'FF'FF'FF; -static const base_t base_deg_mask[8] = { - (base_t)0x00'00'00'00'00'00'00'FF, - (base_t)0x00'00'00'00'00'00'FF'00, - (base_t)0x00'00'00'00'00'FF'00'00, - (base_t)0x00'00'00'00'FF'00'00'00, - (base_t)0x00'00'00'FF'00'00'00'00, - (base_t)0x00'00'FF'00'00'00'00'00, - (base_t)0x00'FF'00'00'00'00'00'00, - (base_t)0xFF'00'00'00'00'00'00'00}; -static const size_t THREAD_X = 32; // 列 -static const size_t THREAD_Y = base_deg; // 行 +static const size_t THREAD_X = 32; // 列 +static const size_t THREAD_Y = 8; // 行 -__constant__ gf28_t d_mul_table[1 << base_deg][1 << base_deg]; - -__host__ __device__ base_t *at_pitch(base_t *base, size_t pitch, size_t r, size_t w) +__host__ __device__ base_t *at_base(base_t *base, size_t rowstride, size_t r, size_t w) { - return base + r * pitch + w; -} - -__host__ __device__ inline size_t offset(size_t idx) -{ - return idx << 3; -} - -__host__ __device__ inline gf28_t get8(base_t src, size_t idx) -{ - return (gf28_t)(src >> offset(idx)); -} - -// 确保set8对应位置的值为0 -__host__ __device__ inline void set8(base_t &dst, size_t idx, gf28_t src) -{ - dst |= (base_t)src << offset(idx); -} - -__host__ inline void del8(base_t &dst, size_t idx) -{ - dst &= ~base_deg_mask[idx]; -} - -__device__ inline base_t mul_base(const gf28_t val, const base_t base, const size_t offset = 0) -{ - if (val == 0) - { - return base_zero; - } - base_t temp = base_zero; - for (size_t i = offset; i < base_num; i++) - { - set8(temp, i, d_mul_table[val][get8(base, i)]); - } - return temp; -} - -__global__ void gpu_mktb_kernel(base_t *r_tb, size_t tb_pitch, base_t *src, size_t s_pitch, size_t width, size_t nrows) -{ - size_t w = blockIdx.x * blockDim.x + threadIdx.x; - size_t r = blockIdx.y * blockDim.y + threadIdx.y; - - if (w >= width || r >= nrows) - { - return; - } - - gf28_t val = get8(r, 0); - base_t s = *at_pitch(src, s_pitch, get8(r, 1), w); - base_t d = mul_base(val, s); - *at_pitch(r_tb, tb_pitch, r, w) = d; -} - -__host__ inline base_t rev8(base_t n) -{ - n = (n & 0xff00ff00ff00ff00ul) >> 8 | (n & 0x00ff00ff00ff00fful) << 8; - n = (n & 0xffff0000ffff0000ul) >> 16 | (n & 0x0000ffff0000fffful) << 16; - return n >> 32 | n << 32; + return base + r * rowstride + w; } #define CUDA_CHECK(call) \ diff --git a/include/multiplication.cuh b/include/multiplication.cuh deleted file mode 100644 index 211ca62..0000000 --- a/include/multiplication.cuh +++ /dev/null @@ -1,55 +0,0 @@ -#ifndef MULTIPLICATION_CUH -#define MULTIPLICATION_CUH - -#include "matrix.cuh" -#include "gf28.cuh" - -__global__ void gpu_addmul_kernel(base_t *a, size_t a_pitch, base_t *tb, size_t tb_pitch, base_t *c, size_t c_pitch, size_t tb_num, size_t width, size_t nrows) -{ - size_t w = blockIdx.x * blockDim.x + threadIdx.x; - size_t r = blockIdx.y * blockDim.y + threadIdx.y; - - if (w >= width || r >= nrows) - { - return; - } - - base_t val = *at_pitch(a, a_pitch, r, 0); - base_t temp = base_zero; - for (size_t i = 0; i < tb_num; i++) - { - temp ^= *at_pitch(tb, tb_pitch, i * (1 << base_deg) + get8(val, i), w); - } - *at_pitch(c, c_pitch, r, w) ^= temp; -} - -__host__ void GF28Matrix::gpu_addmul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf) -{ - assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols); - cudaMemcpyToSymbol(d_mul_table, gf.mul_table, (1 << base_deg) * (1 << base_deg) * sizeof(gf28_t)); - GF28Matrix tb(base_num * (1 << base_deg), b.ncols); - for (size_t w = 0; w < a.width; w++) - { - size_t tb_num = min(base_num, a.ncols - w * base_num); - - dim3 block_tb(THREAD_X, THREAD_Y); - dim3 grid_tb((b.width - 1) / block_tb.x + 1, (tb.nrows - 1) / block_tb.y + 1); - gpu_mktb_kernel<<>>(tb.data, tb.pitch, b.at_base(w * base_num, 0), b.pitch, tb.width, tb_num * (1 << base_deg)); - cudaDeviceSynchronize(); - - dim3 block(THREAD_X, THREAD_Y); - dim3 grid((b.width - 1) / block.x + 1, (nrows - 1) / block.y + 1); - gpu_addmul_kernel<<>>(a.at_base(0, w), a.pitch, tb.data, tb.pitch, data, pitch, tb_num, width, nrows); - cudaDeviceSynchronize(); - } -} - -__host__ GF28Matrix gpu_mul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf) -{ - assert(a.ncols == b.nrows); - GF28Matrix c(a.nrows, b.ncols); - c.gpu_addmul(a, b, gf); - return c; -} - -#endif diff --git a/src/main.cu b/src/main.cu index 06d1dc5..0ba27bd 100644 --- a/src/main.cu +++ b/src/main.cu @@ -2,12 +2,12 @@ int main() { - GF28Matrix a(10, 10); + MatGF256 a(10, 10); a.randomize(123); - GF28Matrix b(10, 10); + MatGF256 b(10, 10); b.randomize(123); - GF28Matrix c(10, 10); - c.gpu_addmul(a, b, GF28(0b100011101)); + MatGF256 c(10, 10); + c.gpu_addmul(a, b, GF256(0b100011101)); cout << a << endl; cout << b << endl; cout << c << endl; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 312d18b..eb161b2 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -3,7 +3,7 @@ find_package(GTest REQUIRED) # 查找GTest库 include_directories(${PROJECT_SOURCE_DIR}/test) # 添加测试头文件目录 set(TEST_SRC_FILES - "test_gf28.cu" + "test_gf256.cu" "test_matrix.cu" ) diff --git a/test/test_gf28.cu b/test/test_gf256.cu similarity index 89% rename from test/test_gf28.cu rename to test/test_gf256.cu index cef1806..a774a45 100644 --- a/test/test_gf28.cu +++ b/test/test_gf256.cu @@ -1,7 +1,7 @@ #include #include "test_header.cuh" -vector expect_inv_table{ +vector expect_inv_table{ 0x00, 0x01, 0x8E, 0xF4, 0x47, 0xA7, 0x7A, 0xBA, 0xAD, 0x9D, 0xDD, 0x98, 0x3D, 0xAA, 0x5D, 0x96, 0xD8, 0x72, 0xC0, 0x58, 0xE0, 0x3E, 0x4C, 0x66, 0x90, 0xDE, 0x55, 0x80, 0xA0, 0x83, 0x4B, 0x2A, 0x6C, 0xED, 0x39, 0x51, 0x60, 0x56, 0x2C, 0x8A, 0x70, 0xD0, 0x1F, 0x4A, 0x26, 0x8B, 0x33, 0x6E, @@ -19,11 +19,11 @@ vector expect_inv_table{ 0x14, 0x3F, 0xE6, 0xF0, 0x86, 0xB1, 0xE2, 0xF1, 0xFA, 0x74, 0xF3, 0xB4, 0x6D, 0x21, 0xB2, 0x6A, 0xE3, 0xE7, 0xB5, 0xEA, 0x03, 0x8F, 0xD3, 0xC9, 0x42, 0xD4, 0xE8, 0x75, 0x7F, 0xFF, 0x7E, 0xFD}; -TEST(TestGF28, Inv) +TEST(TestGF256, Inv) { - GF28 gf28(0b100011101); - for (size_t x = 0; x < 1 << base_deg; x++) + GF256 gf256(0b100011101); + for (size_t x = 0; x < 1 << gf256_len; x++) { - EXPECT_EQ(gf28.inv(x), expect_inv_table[x]); + EXPECT_EQ(gf256.inv(x), expect_inv_table[x]); } } \ No newline at end of file diff --git a/test/test_matrix.cu b/test/test_matrix.cu index bd9144b..5ac2fd8 100644 --- a/test/test_matrix.cu +++ b/test/test_matrix.cu @@ -3,37 +3,37 @@ TEST(TestMatrix, Equal) { - GF28Matrix a(50, 50); + MatGF256 a(50, 50); EXPECT_TRUE(a == base_zero); - GF28Matrix v = a.createView(0, 0, 30, 3); + MatGF256 v = a.createView(0, 0, 30, 3); EXPECT_TRUE(v == base_zero); a.randomize(1243); EXPECT_TRUE(a == a); EXPECT_TRUE(v == v); - GF28Matrix b(50, 50); + MatGF256 b(50, 50); b.randomize(1243); EXPECT_FALSE(a == b); } TEST(TestMatrix, Xor) { - GF28Matrix a(50, 50), b(50, 50); + MatGF256 a(50, 50), b(50, 50); a.randomize(1243); b.randomize(1243); - GF28Matrix c = a ^ b; + MatGF256 c = a ^ b; a ^= c; EXPECT_TRUE(a == b); - GF28Matrix va = a.createView(20, 1, 30, 3); - GF28Matrix vb = b.createView(10, 2, 20, 4); - GF28Matrix vc = va ^ vb; + MatGF256 va = a.createView(20, 1, 30, 3); + MatGF256 vb = b.createView(10, 2, 20, 4); + MatGF256 vc = va ^ vb; va ^= vc; EXPECT_TRUE(va == vb); } // TEST(TestMatrix, Basic) // { -// GF28Matrix a(50, 50); -// GF28Matrix v = a.createView(0, 0, 30, 3); +// MatGF256 a(50, 50); +// MatGF256 v = a.createView(0, 0, 30, 3); -// EXPECT_EQ(v.type, GF28Matrix::view); +// EXPECT_EQ(v.type, MatGF256::view); // } \ No newline at end of file