Modified the file structure
This commit is contained in:
parent a1c14efec3
commit 5dc97e0039
@@ -12,7 +12,7 @@ set(CMAKE_CUDA_ARCHITECTURES native) # set the CUDA architectures
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3")
 
-project(cuElim_GF28 LANGUAGES CXX CUDA) # set the project name and languages
+project(cuElim_GF256 LANGUAGES CXX CUDA) # set the project name and languages
 
 include_directories(${PROJECT_SOURCE_DIR}/include) # add the header directory
 
@@ -1,18 +1,18 @@
 #include <benchmark/benchmark.h>
 #include "cuelim.cuh"
 
-template <GF28Matrix (*GpuFunc)(const GF28Matrix &, const GF28Matrix &, const GF28 &)>
+template <MatGF256 (*GpuFunc)(const MatGF256 &, const MatGF256 &, const GF256 &)>
 void bench_mul8(benchmark::State &state)
 {
-    GF28 ff(0b100011101);
+    GF256 ff(0b100011101);
     uint_fast32_t seed = 41921095;
     size_t x = state.range(0), y = state.range(1), z = state.range(2);
-    GF28Matrix A(x, y), B(y, z);
+    MatGF256 A(x, y), B(y, z);
     A.randomize(seed);
     B.randomize(seed);
     for (auto _ : state)
     {
-        GF28Matrix C(GpuFunc(A, B, ff));
+        MatGF256 C(GpuFunc(A, B, ff));
     }
 }
 
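Note: only the benchmark template itself is shown in this hunk. A minimal sketch of how it might be registered with Google Benchmark, where the `Args` triple feeds `state.range(0)`..`state.range(2)`; the registration lines and the sizes are assumptions, not part of this commit:

    // hypothetical registration for the renamed API
    BENCHMARK(bench_mul8<gpu_mul>)->Args({1024, 1024, 1024});
    BENCHMARK_MAIN();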
@@ -1,6 +1,7 @@
 #ifndef CUELIM_CUH
 #define CUELIM_CUH
 
-#include "multiplication.cuh"
+#include "gf256/gf256_mul.cuh"
+#include "gf256/gf256_elim.cuh"
 
 #endif
include/gf256/gf256_elim.cuh (new file, 11 lines)
@@ -0,0 +1,11 @@
+#ifndef ELIMINATION_CUH
+#define ELIMINATION_CUH
+
+#include "gf256_mat.cuh"
+
+struct ElimResult
+{
+    size_t rank;
+};
+
+#endif
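Note: `ElimResult` is only a stub in this commit. A hedged sketch of the shape an elimination entry point might eventually take; the `gpu_elim` name and signature are hypothetical, chosen only to mirror the existing `gpu_mul` style:

    // hypothetical: reduce m in place over GF(2^8) and report its rank
    // ElimResult gpu_elim(MatGF256 &m, const GF256 &gf);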
include/gf256/gf256_header.cuh (new executable file, 162 lines)
@@ -0,0 +1,162 @@
+#ifndef GF256_CUH
+#define GF256_CUH
+
+#include "../header.cuh"
+#include <set>
+
+using gf256_t = uint8_t;
+
+static const size_t gf256_len = sizeof(gf256_t) * 8;
+static const size_t gf256_num = base_len / gf256_len;
+
+static const gf256_t gf256_zero = (gf256_t)0x00;
+static const gf256_t gf256_one = (gf256_t)0x01;
+
+static const base_t gf256_mask[8] = {
+    (base_t)0x00'00'00'00'00'00'00'FF,
+    (base_t)0x00'00'00'00'00'00'FF'00,
+    (base_t)0x00'00'00'00'00'FF'00'00,
+    (base_t)0x00'00'00'00'FF'00'00'00,
+    (base_t)0x00'00'00'FF'00'00'00'00,
+    (base_t)0x00'00'FF'00'00'00'00'00,
+    (base_t)0x00'FF'00'00'00'00'00'00,
+    (base_t)0xFF'00'00'00'00'00'00'00};
+
+__host__ __device__ inline size_t offset8(size_t idx)
+{
+    return idx << 3;
+}
+
+__host__ __device__ inline gf256_t get8(base_t src, size_t idx)
+{
+    return (gf256_t)(src >> offset8(idx));
+}
+
+// caller must ensure the byte that set8 targets is still 0
+__host__ __device__ inline void set8(base_t &dst, size_t idx, gf256_t src)
+{
+    dst |= (base_t)src << offset8(idx);
+}
+
+__host__ inline void del8(base_t &dst, size_t idx)
+{
+    dst &= ~gf256_mask[idx];
+}
+
+__host__ inline base_t rev8(base_t n)
+{
+    n = (n & 0xff00ff00ff00ff00ul) >> 8 | (n & 0x00ff00ff00ff00fful) << 8;
+    n = (n & 0xffff0000ffff0000ul) >> 16 | (n & 0x0000ffff0000fffful) << 16;
+    return n >> 32 | n << 32;
+}
+
+__constant__ gf256_t d_mul_table[1 << gf256_len][1 << gf256_len];
+
+__device__ inline base_t mul_base(const gf256_t val, const base_t base)
+{
+    if (val == 0)
+    {
+        return base_zero;
+    }
+    base_t temp = base_zero;
+    for (size_t i = 0; i < gf256_num; i++)
+    {
+        set8(temp, i, d_mul_table[val][get8(base, i)]);
+    }
+    return temp;
+}
+
+static const set<base_t> irreducible_polynomials_degree_08{0x11b, 0x11d, 0x12b, 0x12d, 0x139, 0x13f, 0x14d, 0x15f, 0x163, 0x165, 0x169, 0x171, 0x177, 0x17b, 0x187, 0x18b, 0x18d, 0x19f, 0x1a3, 0x1a9, 0x1b1, 0x1bd, 0x1c3, 0x1cf, 0x1d7, 0x1dd, 0x1e7, 0x1f3, 0x1f5, 0x1f9};
+
+class GF256
+{
+public:
+    GF256(base_t poly)
+    {
+        assert(irreducible_polynomials_degree_08.count(poly) == 1);
+        this->poly = poly;
+        for (size_t x = 0; x < (1 << gf256_len); x++)
+        {
+            mul_table[x][gf256_zero] = gf256_zero;
+            for (size_t d = 0; d < gf256_len; d++)
+            {
+                gf256_t val = shift_left(x, d);
+                for (size_t y = (1 << d); y < (1 << (d + 1)); y++)
+                {
+                    mul_table[x][y] = val ^ mul_table[x][y ^ (1 << d)];
+                    if (mul_table[x][y] == gf256_one)
+                    {
+                        inv_table[x] = y;
+                    }
+                }
+            }
+        }
+        inv_table[gf256_zero] = gf256_zero;
+    }
+
+    gf256_t mul(const gf256_t x, const gf256_t y) const
+    {
+        return mul_table[x][y];
+    }
+
+    base_t mul_base(const gf256_t val, const base_t base, const size_t offset = 0) const
+    {
+        base_t temp = base_zero;
+        for (size_t i = offset; i < gf256_num; i++)
+        {
+            set8(temp, i, mul(val, get8(base, i)));
+        }
+        return temp;
+    }
+
+    gf256_t inv(gf256_t x)
+    {
+        return inv_table[x];
+    }
+
+    inline cudaError_t cpy_to_constant() const
+    {
+        return cudaMemcpyToSymbol(d_mul_table, mul_table, (1 << gf256_len) * (1 << gf256_len) * sizeof(gf256_t));
+    }
+
+    friend ostream &operator<<(ostream &out, const GF256 &gf);
+
+    GF256() = delete;
+    GF256(const GF256 &) = delete;
+    GF256(GF256 &&) = delete;
+    GF256 &operator=(const GF256 &) = delete;
+    GF256 &operator=(GF256 &&) = delete;
+
+private:
+    gf256_t shift_left(gf256_t x, size_t d)
+    {
+        base_t temp = (base_t)x << d;
+        for (size_t i = gf256_len - 1 + d; i > gf256_len - 1; i--)
+        {
+            if (temp & (1 << i))
+            {
+                temp ^= poly << (i - gf256_len);
+            }
+        }
+        return temp;
+    }
+
+    base_t poly;
+    gf256_t inv_table[1 << gf256_len];
+    gf256_t mul_table[1 << gf256_len][1 << gf256_len];
+};
+
+ostream &operator<<(ostream &out, const GF256 &gf)
+{
+    for (size_t x = 0; x < 1 << gf256_len; x++)
+    {
+        for (size_t y = 0; y < 1 << gf256_len; y++)
+        {
+            printf("%02X ", gf.mul_table[x][y]);
+        }
+        printf("\n");
+    }
+    return out;
+}
+
+#endif
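Note: the constructor fills `mul_table` by GF(2)-linearity: for `y` with top set bit at position `d`, `x*y = (x*2^d) ^ (x*(y ^ 2^d))`, where `shift_left` reduces `x*2^d` modulo the chosen irreducible polynomial. A standalone reference multiplier that could cross-check the table, assuming the AES polynomial 0x11b (this helper is an illustration only, not part of the commit):

    // independent shift-and-add multiplier over GF(2^8) mod 0x11b;
    // e.g. ref_mul(0x02, 0x80) == 0x1b, matching GF256(0b100011101).mul(0x02, 0x80)
    static uint8_t ref_mul(uint8_t a, uint8_t b)
    {
        uint8_t p = 0;
        while (b)
        {
            if (b & 1) p ^= a;      // add a when the low bit of b is set
            bool hi = a & 0x80;
            a <<= 1;
            if (hi) a ^= 0x1b;      // reduce by x^8 + x^4 + x^3 + x + 1
            b >>= 1;
        }
        return p;
    }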
@@ -1,10 +1,11 @@
-#ifndef MATRIX_CUH
-#define MATRIX_CUH
+#ifndef MATGF256_CUH
+#define MATGF256_CUH
 
 #include "header.cuh"
-#include "gf28.cuh"
+#include "gf256_header.cuh"
+#include <random>
 
-class GF28Matrix
+class MatGF256
 {
 public:
     enum MatType
@@ -13,38 +14,38 @@ public:
         view
     };
     // only root matrices can be constructed directly
-    GF28Matrix(size_t nrows, size_t ncols) : nrows(nrows), ncols(ncols), type(root)
+    MatGF256(size_t nrows, size_t ncols) : nrows(nrows), ncols(ncols), type(root)
     {
-        width = (ncols - 1) / base_num + 1;
-        pitch = ((width - 1) / 4 + 1) * 4; // align rows to 32 bytes (4 x 64-bit words)
-        CUDA_CHECK(cudaMallocManaged((void **)&data, nrows * pitch * sizeof(base_t)));
-        CUDA_CHECK(cudaMemset(data, 0, nrows * pitch * sizeof(base_t)));
+        width = (ncols - 1) / gf256_num + 1;
+        rowstride = ((width - 1) / 4 + 1) * 4; // align rows to 32 bytes (4 x 64-bit words)
+        CUDA_CHECK(cudaMallocManaged((void **)&data, nrows * rowstride * sizeof(base_t)));
+        CUDA_CHECK(cudaMemset(data, 0, nrows * rowstride * sizeof(base_t)));
     }
     // copy construction always produces a root matrix
-    GF28Matrix(const GF28Matrix &m) : GF28Matrix(m.nrows, m.ncols)
+    MatGF256(const MatGF256 &m) : MatGF256(m.nrows, m.ncols)
     {
-        CUDA_CHECK(cudaMemcpy2D(data, pitch * sizeof(base_t), m.data, m.pitch * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault));
+        CUDA_CHECK(cudaMemcpy2D(data, rowstride * sizeof(base_t), m.data, m.rowstride * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault));
     }
-    GF28Matrix(GF28Matrix &&m) noexcept : nrows(m.nrows), ncols(m.ncols), width(m.width), pitch(m.pitch), type(m.type), data(m.data)
+    MatGF256(MatGF256 &&m) noexcept : nrows(m.nrows), ncols(m.ncols), width(m.width), rowstride(m.rowstride), type(m.type), data(m.data)
     {
         m.nrows = 0;
         m.ncols = 0;
         m.width = 0;
-        m.pitch = 0;
+        m.rowstride = 0;
         m.type = view;
         m.data = nullptr;
     }
-    GF28Matrix &operator=(const GF28Matrix &m)
+    MatGF256 &operator=(const MatGF256 &m)
     {
         if (this == &m)
         {
             return *this;
         }
         assert(nrows == m.nrows && ncols == m.ncols);
-        CUDA_CHECK(cudaMemcpy2D(data, pitch * sizeof(base_t), m.data, m.pitch * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault));
+        CUDA_CHECK(cudaMemcpy2D(data, rowstride * sizeof(base_t), m.data, m.rowstride * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault));
         return *this;
     }
-    GF28Matrix &operator=(GF28Matrix &&m) noexcept
+    MatGF256 &operator=(MatGF256 &&m) noexcept
     {
         if (this == &m)
         {
@@ -57,19 +58,19 @@ public:
         nrows = m.nrows;
         ncols = m.ncols;
         width = m.width;
-        pitch = m.pitch;
+        rowstride = m.rowstride;
         type = m.type;
         data = m.data;
         m.nrows = 0;
         m.ncols = 0;
         m.width = 0;
-        m.pitch = 0;
+        m.rowstride = 0;
         m.type = view;
         m.data = nullptr;
         return *this;
     }
 
-    ~GF28Matrix()
+    ~MatGF256()
     {
         if (type == root)
         {
@@ -79,18 +80,18 @@ public:
 
     inline base_t *at_base(size_t r, size_t w) const
     {
-        return data + r * pitch + w;
+        return data + r * rowstride + w;
     }
 
     // views can only be delimited in whole base_t units
-    GF28Matrix createView(size_t begin_ri, size_t begin_wi, size_t end_rj, size_t end_wj) const
+    MatGF256 createView(size_t begin_ri, size_t begin_wi, size_t end_rj, size_t end_wj) const
     {
         assert(begin_ri < end_rj && end_rj <= nrows && begin_wi < end_wj && end_wj <= width);
-        GF28Matrix view;
+        MatGF256 view;
         view.nrows = end_rj - begin_ri;
-        view.ncols = (end_wj == width ? ncols : end_wj * base_num) - begin_wi * base_num;
+        view.ncols = (end_wj == width ? ncols : end_wj * gf256_num) - begin_wi * gf256_num;
         view.width = end_wj - begin_wi;
-        view.pitch = pitch;
+        view.rowstride = rowstride;
         view.data = at_base(begin_ri, begin_wi);
         return view;
     }
@@ -100,7 +101,7 @@ public:
         assert(type == root);
         static default_random_engine e(seed);
        static uniform_int_distribution<base_t> d;
-        base_t lastmask = base_fullmask >> (width * base_len - ncols * base_deg);
+        base_t lastmask = base_fullmask >> (width * base_len - ncols * gf256_len);
         for (size_t r = 0; r < nrows; r++)
         {
             for (size_t w = 0; w < width; w++)
@@ -111,23 +112,7 @@ public:
         }
     }
 
-    // void write(string path) const
-    // {
-    //     assert(type == root);
-    //     ofstream out(path, ios::binary);
-    //     out.write((char *)data, nrows * pitch * sizeof(base_t));
-    //     out.close();
-    // }
-
-    // void read(string path)
-    // {
-    //     assert(type == root);
-    //     ifstream in(path, ios::binary);
-    //     in.read((char *)data, nrows * pitch * sizeof(base_t));
-    //     in.close();
-    // }
-
-    bool operator==(const GF28Matrix &m) const
+    bool operator==(const MatGF256 &m) const
     {
         if (nrows != m.nrows || ncols != m.ncols)
         {
@@ -160,7 +145,7 @@ public:
         }
         return true;
     }
-    void operator^=(const GF28Matrix &m)
+    void operator^=(const MatGF256 &m)
     {
         assert(nrows == m.nrows && ncols == m.ncols);
         for (size_t r = 0; r < nrows; r++)
@@ -171,29 +156,29 @@ public:
             }
         }
     }
-    GF28Matrix operator^(const GF28Matrix &m) const
+    MatGF256 operator^(const MatGF256 &m) const
     {
-        GF28Matrix temp(*this);
+        MatGF256 temp(*this);
         temp ^= m;
         return temp;
     }
 
-    friend ostream &operator<<(ostream &out, const GF28Matrix &m);
-    void gpu_addmul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf);
-    friend GF28Matrix gpu_mul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf);
+    friend ostream &operator<<(ostream &out, const MatGF256 &m);
+    void gpu_addmul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf);
+    friend MatGF256 gpu_mul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf);
 
     // size_t nrows, ncols;
-    // size_t width, pitch;
+    // size_t width, rowstride;
 
private:
-    GF28Matrix() : nrows(0), ncols(0), width(0), pitch(0), type(view), data(nullptr) {}
+    MatGF256() : nrows(0), ncols(0), width(0), rowstride(0), type(view), data(nullptr) {}
     size_t nrows, ncols;
-    size_t width, pitch;
+    size_t width, rowstride;
     MatType type;
     base_t *data;
 };
 
-ostream &operator<<(ostream &out, const GF28Matrix &m)
+ostream &operator<<(ostream &out, const MatGF256 &m)
 {
     for (size_t r = 0; r < m.nrows; r++)
     {
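Note: the class packs 8 GF(2^8) elements into each 64-bit `base_t` word (`gf256_num` per word), and views alias the root matrix's storage while copies allocate fresh unified memory. A brief host-side usage sketch under those assumptions (sizes are illustrative):

    MatGF256 A(100, 100);                    // root matrix: unified memory, zero-initialized
    A.randomize(12345);
    MatGF256 v = A.createView(0, 0, 50, 1);  // rows [0,50) x word column 0 = first 8 columns
    MatGF256 B(A);                           // deep copy; B does not alias A, but v does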
include/gf256/gf256_mul.cuh (new file, 70 lines)
@@ -0,0 +1,70 @@
+#ifndef MULTIPLICATION_CUH
+#define MULTIPLICATION_CUH
+
+#include "gf256_mat.cuh"
+
+__global__ void gpu_mktb_kernel(base_t *r_tb, size_t tb_rowstride, base_t *src, size_t s_rowstride, size_t width, size_t nrows)
+{
+    size_t w = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t r = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (w >= width || r >= nrows)
+    {
+        return;
+    }
+
+    gf256_t val = get8(r, 0);
+    base_t s = *at_base(src, s_rowstride, get8(r, 1), w);
+    base_t d = mul_base(val, s);
+    *at_base(r_tb, tb_rowstride, r, w) = d;
+}
+
+__global__ void gpu_addmul_kernel(base_t *a, size_t a_rowstride, base_t *tb, size_t tb_rowstride, base_t *c, size_t c_rowstride, size_t tb_num, size_t width, size_t nrows)
+{
+    size_t w = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t r = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (w >= width || r >= nrows)
+    {
+        return;
+    }
+
+    base_t val = *at_base(a, a_rowstride, r, 0);
+    base_t temp = base_zero;
+    for (size_t i = 0; i < tb_num; i++)
+    {
+        temp ^= *at_base(tb, tb_rowstride, i * (1 << gf256_len) + get8(val, i), w);
+    }
+    *at_base(c, c_rowstride, r, w) ^= temp;
+}
+
+__host__ void MatGF256::gpu_addmul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf)
+{
+    assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols);
+    gf.cpy_to_constant();
+    MatGF256 tb(gf256_num * (1 << gf256_len), b.ncols);
+    for (size_t w = 0; w < a.width; w++)
+    {
+        size_t tb_num = min(gf256_num, a.ncols - w * gf256_num);
+
+        dim3 block_tb(THREAD_X, THREAD_Y);
+        dim3 grid_tb((b.width - 1) / block_tb.x + 1, (tb.nrows - 1) / block_tb.y + 1);
+        gpu_mktb_kernel<<<grid_tb, block_tb>>>(tb.data, tb.rowstride, b.at_base(w * gf256_num, 0), b.rowstride, tb.width, tb_num * (1 << gf256_len));
+        cudaDeviceSynchronize();
+
+        dim3 block(THREAD_X, THREAD_Y);
+        dim3 grid((b.width - 1) / block.x + 1, (nrows - 1) / block.y + 1);
+        gpu_addmul_kernel<<<grid, block>>>(a.at_base(0, w), a.rowstride, tb.data, tb.rowstride, data, rowstride, tb_num, width, nrows);
+        cudaDeviceSynchronize();
+    }
+}
+
+__host__ MatGF256 gpu_mul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf)
+{
+    assert(a.ncols == b.nrows);
+    MatGF256 c(a.nrows, b.ncols);
+    c.gpu_addmul(a, b, gf);
+    return c;
+}
+
+#endif
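Note: the two kernels implement a tabulated multiply. For each word column of A, `gpu_mktb_kernel` precomputes, per byte position, a 256-row table of products v * (row block of B); `gpu_addmul_kernel` then XOR-accumulates one table row per byte of A's word, so each output word costs 8 lookups instead of 8 field multiplies per element. A minimal CPU sketch of the same computation over unpacked bytes (illustration only; it ignores the packed `base_t` layout and assumes a constructed `GF256 gf`):

    #include <vector>
    #include <cstdint>

    // C[i][j] ^= sum over k of A[i][k] * B[k][j], elementwise over GF(2^8)
    void cpu_addmul(std::vector<std::vector<uint8_t>> &C,
                    const std::vector<std::vector<uint8_t>> &A,
                    const std::vector<std::vector<uint8_t>> &B,
                    const GF256 &gf)
    {
        for (size_t i = 0; i < A.size(); i++)
            for (size_t k = 0; k < B.size(); k++)
                for (size_t j = 0; j < B[0].size(); j++)
                    C[i][j] ^= gf.mul(A[i][k], B[k][j]); // addition in GF(2^8) is XOR
    }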
@@ -1,95 +0,0 @@
-#ifndef GF28_CUH
-#define GF28_CUH
-
-#include "header.cuh"
-
-static const set<base_t> irreducible_polynomials_degree_08{0x11b, 0x11d, 0x12b, 0x12d, 0x139, 0x13f, 0x14d, 0x15f, 0x163, 0x165, 0x169, 0x171, 0x177, 0x17b, 0x187, 0x18b, 0x18d, 0x19f, 0x1a3, 0x1a9, 0x1b1, 0x1bd, 0x1c3, 0x1cf, 0x1d7, 0x1dd, 0x1e7, 0x1f3, 0x1f5, 0x1f9};
-
-class GF28
-{
-public:
-    GF28(base_t poly)
-    {
-        assert(irreducible_polynomials_degree_08.count(poly) == 1);
-        this->poly = poly;
-        for (size_t x = 0; x < (1 << base_deg); x++)
-        {
-            mul_table[x][gf28_zero] = gf28_zero;
-            for (size_t d = 0; d < base_deg; d++)
-            {
-                gf28_t val = shift_left(x, d);
-                for (size_t y = (1 << d); y < (1 << (d + 1)); y++)
-                {
-                    mul_table[x][y] = val ^ mul_table[x][y ^ (1 << d)];
-                    if (mul_table[x][y] == gf28_one)
-                    {
-                        inv_table[x] = y;
-                    }
-                }
-            }
-        }
-        inv_table[gf28_zero] = gf28_zero;
-    }
-
-    gf28_t mul(const gf28_t x, const gf28_t y) const
-    {
-        return mul_table[x][y];
-    }
-
-    base_t mul_base(const gf28_t val, const base_t base, const size_t offset = 0) const
-    {
-        base_t temp = base_zero;
-        for (size_t i = offset; i < base_num; i++)
-        {
-            set8(temp, mul(val, get8(base, i)), i);
-        }
-        return temp;
-    }
-
-    gf28_t inv(gf28_t x)
-    {
-        return inv_table[x];
-    }
-
-    friend ostream &operator<<(ostream &out, const GF28 &gf);
-
-    GF28() = delete;
-    GF28(const GF28 &) = delete;
-    GF28(GF28 &&) = delete;
-    GF28 &operator=(const GF28 &) = delete;
-    GF28 &operator=(GF28 &&) = delete;
-
-    gf28_t mul_table[1 << base_deg][1 << base_deg];
-
-private:
-    gf28_t shift_left(gf28_t x, size_t d)
-    {
-        base_t temp = (base_t)x << d;
-        for (size_t i = base_deg - 1 + d; i > base_deg - 1; i--)
-        {
-            if (temp & (1 << i))
-            {
-                temp ^= poly << (i - base_deg);
-            }
-        }
-        return temp;
-    }
-
-    base_t poly;
-    gf28_t inv_table[1 << base_deg];
-};
-
-ostream &operator<<(ostream &out, const GF28 &gf)
-{
-    for (size_t x = 0; x < 1 << base_deg; x++)
-    {
-        for (size_t y = 0; y < 1 << base_deg; y++)
-        {
-            printf("%02X ", gf.mul_table[x][y]);
-        }
-        printf("\n");
-    }
-    return out;
-}
-
-#endif
@@ -3,10 +3,8 @@
 
 #include <iostream>
 #include <cassert>
-// #include <fstream> // matrix
 
-#include <set>    // gf28
-#include <random> // matrix
+// matrix
 // #include <map>
 // #include <vector>
 
@@ -17,95 +15,20 @@
 using namespace std;
 
 using base_t = uint64_t;
-using gf28_t = uint8_t;
 
-static const size_t base_deg = 8;
-static const size_t base_num = 8;
-static const size_t base_len = 64;
-static_assert(base_len == base_deg * base_num && base_len == sizeof(base_t) * 8);
+static const size_t base_len = sizeof(base_t) * 8;
 
 static const base_t base_zero = (base_t)0x00'00'00'00'00'00'00'00;
 static const base_t base_one = (base_t)0x00'00'00'00'00'00'00'01;
-static const gf28_t gf28_zero = (gf28_t)0x00;
-static const gf28_t gf28_one = (gf28_t)0x01;
 
 static const base_t base_fullmask = (base_t)0xFF'FF'FF'FF'FF'FF'FF'FF;
-static const base_t base_deg_mask[8] = {
-    (base_t)0x00'00'00'00'00'00'00'FF,
-    (base_t)0x00'00'00'00'00'00'FF'00,
-    (base_t)0x00'00'00'00'00'FF'00'00,
-    (base_t)0x00'00'00'00'FF'00'00'00,
-    (base_t)0x00'00'00'FF'00'00'00'00,
-    (base_t)0x00'00'FF'00'00'00'00'00,
-    (base_t)0x00'FF'00'00'00'00'00'00,
-    (base_t)0xFF'00'00'00'00'00'00'00};
 
 static const size_t THREAD_X = 32;       // columns
-static const size_t THREAD_Y = base_deg; // rows
+static const size_t THREAD_Y = 8;        // rows
 
-__constant__ gf28_t d_mul_table[1 << base_deg][1 << base_deg];
-
-__host__ __device__ base_t *at_pitch(base_t *base, size_t pitch, size_t r, size_t w)
+__host__ __device__ base_t *at_base(base_t *base, size_t rowstride, size_t r, size_t w)
 {
-    return base + r * pitch + w;
-}
-
-__host__ __device__ inline size_t offset(size_t idx)
-{
-    return idx << 3;
-}
-
-__host__ __device__ inline gf28_t get8(base_t src, size_t idx)
-{
-    return (gf28_t)(src >> offset(idx));
-}
-
-// caller must ensure the byte that set8 targets is still 0
-__host__ __device__ inline void set8(base_t &dst, size_t idx, gf28_t src)
-{
-    dst |= (base_t)src << offset(idx);
-}
-
-__host__ inline void del8(base_t &dst, size_t idx)
-{
-    dst &= ~base_deg_mask[idx];
-}
-
-__device__ inline base_t mul_base(const gf28_t val, const base_t base, const size_t offset = 0)
-{
-    if (val == 0)
-    {
-        return base_zero;
-    }
-    base_t temp = base_zero;
-    for (size_t i = offset; i < base_num; i++)
-    {
-        set8(temp, i, d_mul_table[val][get8(base, i)]);
-    }
-    return temp;
-}
-
-__global__ void gpu_mktb_kernel(base_t *r_tb, size_t tb_pitch, base_t *src, size_t s_pitch, size_t width, size_t nrows)
-{
-    size_t w = blockIdx.x * blockDim.x + threadIdx.x;
-    size_t r = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (w >= width || r >= nrows)
-    {
-        return;
-    }
-
-    gf28_t val = get8(r, 0);
-    base_t s = *at_pitch(src, s_pitch, get8(r, 1), w);
-    base_t d = mul_base(val, s);
-    *at_pitch(r_tb, tb_pitch, r, w) = d;
-}
-
-__host__ inline base_t rev8(base_t n)
-{
-    n = (n & 0xff00ff00ff00ff00ul) >> 8 | (n & 0x00ff00ff00ff00fful) << 8;
-    n = (n & 0xffff0000ffff0000ul) >> 16 | (n & 0x0000ffff0000fffful) << 16;
-    return n >> 32 | n << 32;
+    return base + r * rowstride + w;
 }
 
 #define CUDA_CHECK(call) \
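Note: `rev8` (which this commit moves into gf256_header.cuh) reverses the byte order of a 64-bit word in three swap passes: adjacent bytes, then 16-bit pairs, then 32-bit halves. A worked check, computed by hand from the definition:

    // rev8(0x0102'0304'0506'0708) == 0x0807'0605'0403'0201
    // pass 1 (swap adjacent bytes):  0x0201'0403'0605'0807
    // pass 2 (swap 16-bit pairs):    0x0403'0201'0807'0605
    // pass 3 (swap 32-bit halves):   0x0807'0605'0403'0201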
@@ -1,55 +0,0 @@
-#ifndef MULTIPLICATION_CUH
-#define MULTIPLICATION_CUH
-
-#include "matrix.cuh"
-#include "gf28.cuh"
-
-__global__ void gpu_addmul_kernel(base_t *a, size_t a_pitch, base_t *tb, size_t tb_pitch, base_t *c, size_t c_pitch, size_t tb_num, size_t width, size_t nrows)
-{
-    size_t w = blockIdx.x * blockDim.x + threadIdx.x;
-    size_t r = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (w >= width || r >= nrows)
-    {
-        return;
-    }
-
-    base_t val = *at_pitch(a, a_pitch, r, 0);
-    base_t temp = base_zero;
-    for (size_t i = 0; i < tb_num; i++)
-    {
-        temp ^= *at_pitch(tb, tb_pitch, i * (1 << base_deg) + get8(val, i), w);
-    }
-    *at_pitch(c, c_pitch, r, w) ^= temp;
-}
-
-__host__ void GF28Matrix::gpu_addmul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf)
-{
-    assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols);
-    cudaMemcpyToSymbol(d_mul_table, gf.mul_table, (1 << base_deg) * (1 << base_deg) * sizeof(gf28_t));
-    GF28Matrix tb(base_num * (1 << base_deg), b.ncols);
-    for (size_t w = 0; w < a.width; w++)
-    {
-        size_t tb_num = min(base_num, a.ncols - w * base_num);
-
-        dim3 block_tb(THREAD_X, THREAD_Y);
-        dim3 grid_tb((b.width - 1) / block_tb.x + 1, (tb.nrows - 1) / block_tb.y + 1);
-        gpu_mktb_kernel<<<grid_tb, block_tb>>>(tb.data, tb.pitch, b.at_base(w * base_num, 0), b.pitch, tb.width, tb_num * (1 << base_deg));
-        cudaDeviceSynchronize();
-
-        dim3 block(THREAD_X, THREAD_Y);
-        dim3 grid((b.width - 1) / block.x + 1, (nrows - 1) / block.y + 1);
-        gpu_addmul_kernel<<<grid, block>>>(a.at_base(0, w), a.pitch, tb.data, tb.pitch, data, pitch, tb_num, width, nrows);
-        cudaDeviceSynchronize();
-    }
-}
-
-__host__ GF28Matrix gpu_mul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf)
-{
-    assert(a.ncols == b.nrows);
-    GF28Matrix c(a.nrows, b.ncols);
-    c.gpu_addmul(a, b, gf);
-    return c;
-}
-
-#endif
@@ -2,12 +2,12 @@
 
 int main()
 {
-    GF28Matrix a(10, 10);
+    MatGF256 a(10, 10);
     a.randomize(123);
-    GF28Matrix b(10, 10);
+    MatGF256 b(10, 10);
     b.randomize(123);
-    GF28Matrix c(10, 10);
-    c.gpu_addmul(a, b, GF28(0b100011101));
+    MatGF256 c(10, 10);
+    c.gpu_addmul(a, b, GF256(0b100011101));
     cout << a << endl;
     cout << b << endl;
     cout << c << endl;
@@ -3,7 +3,7 @@ find_package(GTest REQUIRED) # find the GTest library
 include_directories(${PROJECT_SOURCE_DIR}/test) # add the test header directory
 
 set(TEST_SRC_FILES
-    "test_gf28.cu"
+    "test_gf256.cu"
     "test_matrix.cu"
 )
 
@@ -1,7 +1,7 @@
 #include <gtest/gtest.h>
 #include "test_header.cuh"
 
-vector<gf28_t> expect_inv_table{
+vector<gf256_t> expect_inv_table{
     0x00, 0x01, 0x8E, 0xF4, 0x47, 0xA7, 0x7A, 0xBA, 0xAD, 0x9D, 0xDD, 0x98, 0x3D, 0xAA, 0x5D, 0x96,
     0xD8, 0x72, 0xC0, 0x58, 0xE0, 0x3E, 0x4C, 0x66, 0x90, 0xDE, 0x55, 0x80, 0xA0, 0x83, 0x4B, 0x2A,
     0x6C, 0xED, 0x39, 0x51, 0x60, 0x56, 0x2C, 0x8A, 0x70, 0xD0, 0x1F, 0x4A, 0x26, 0x8B, 0x33, 0x6E,
@@ -19,11 +19,11 @@ vector<gf28_t> expect_inv_table{
     0x14, 0x3F, 0xE6, 0xF0, 0x86, 0xB1, 0xE2, 0xF1, 0xFA, 0x74, 0xF3, 0xB4, 0x6D, 0x21, 0xB2, 0x6A,
     0xE3, 0xE7, 0xB5, 0xEA, 0x03, 0x8F, 0xD3, 0xC9, 0x42, 0xD4, 0xE8, 0x75, 0x7F, 0xFF, 0x7E, 0xFD};
 
-TEST(TestGF28, Inv)
+TEST(TestGF256, Inv)
 {
-    GF28 gf28(0b100011101);
-    for (size_t x = 0; x < 1 << base_deg; x++)
+    GF256 gf256(0b100011101);
+    for (size_t x = 0; x < 1 << gf256_len; x++)
     {
-        EXPECT_EQ(gf28.inv(x), expect_inv_table[x]);
+        EXPECT_EQ(gf256.inv(x), expect_inv_table[x]);
     }
 }
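Note: the table above pins `inv` to one fixed polynomial. A polynomial-independent property check that could complement it (a sketch; the test name is an assumption, and it relies only on `mul`, `inv`, and `gf256_one` from this commit):

    TEST(TestGF256, MulInvIdentity)
    {
        GF256 gf256(0b100011101);
        for (size_t x = 1; x < 1 << gf256_len; x++)
        {
            EXPECT_EQ(gf256.mul(x, gf256.inv(x)), gf256_one); // x * x^-1 == 1 for x != 0
        }
    }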
@@ -3,37 +3,37 @@
 
 TEST(TestMatrix, Equal)
 {
-    GF28Matrix a(50, 50);
+    MatGF256 a(50, 50);
     EXPECT_TRUE(a == base_zero);
-    GF28Matrix v = a.createView(0, 0, 30, 3);
+    MatGF256 v = a.createView(0, 0, 30, 3);
     EXPECT_TRUE(v == base_zero);
     a.randomize(1243);
     EXPECT_TRUE(a == a);
     EXPECT_TRUE(v == v);
-    GF28Matrix b(50, 50);
+    MatGF256 b(50, 50);
     b.randomize(1243);
     EXPECT_FALSE(a == b);
 }
 
 TEST(TestMatrix, Xor)
 {
-    GF28Matrix a(50, 50), b(50, 50);
+    MatGF256 a(50, 50), b(50, 50);
     a.randomize(1243);
     b.randomize(1243);
-    GF28Matrix c = a ^ b;
+    MatGF256 c = a ^ b;
     a ^= c;
     EXPECT_TRUE(a == b);
-    GF28Matrix va = a.createView(20, 1, 30, 3);
-    GF28Matrix vb = b.createView(10, 2, 20, 4);
-    GF28Matrix vc = va ^ vb;
+    MatGF256 va = a.createView(20, 1, 30, 3);
+    MatGF256 vb = b.createView(10, 2, 20, 4);
+    MatGF256 vc = va ^ vb;
     va ^= vc;
     EXPECT_TRUE(va == vb);
 }
 
 // TEST(TestMatrix, Basic)
 // {
-//     GF28Matrix a(50, 50);
-//     GF28Matrix v = a.createView(0, 0, 30, 3);
+//     MatGF256 a(50, 50);
+//     MatGF256 v = a.createView(0, 0, 30, 3);
 
-//     EXPECT_EQ(v.type, GF28Matrix::view);
+//     EXPECT_EQ(v.type, MatGF256::view);
 // }