Use namespaces to partition the code

shijin 2024-09-14 16:15:13 +08:00
parent 2375705792
commit e73b158a37
15 changed files with 826 additions and 822 deletions
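The split itself is mechanical: the GF(2^8) code moves into namespace gf256 and the GF(65521) code into namespace gfp, and every call site either adds a using-directive (as the benchmark and test hunks below do) or qualifies names explicitly. A minimal sketch of the two resulting call-site styles (polynomials taken from irreducible_polynomials_degree_08 in gf256.cuh):

#include "gf256_mat.cuh"

// Style 1: a file-wide using-directive, as the benchmarks and tests use.
using namespace gf256;
GF256 gf_a(0x11b);

// Style 2: explicit qualification, no using-directive needed.
gf256::GF256 gf_b(0x11d);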

View File

@@ -1,6 +1,8 @@
 #include <benchmark/benchmark.h>
 #include "cuelim.cuh"
+
+using namespace gf256;
 
 template <MatGF256 (*GpuFunc)(const MatGF256 &, const MatGF256 &, const GF256 &)>
 void bench_gf256_mul(benchmark::State &state)
 {

View File

@@ -1,6 +1,8 @@
 #include <benchmark/benchmark.h>
 #include "test_header.cuh"
+
+using namespace gfp;
 
 static void bench_gfp(benchmark::State &state)
 {
     uint_fast32_t seed = 41921095;

View File

@@ -3,190 +3,193 @@ (file contents after this commit; the former gf256_cpu_elim_base, gf256_gpu_mksrc_kernel and gf256_gpu_elim_kernel lose their gf256_ prefixes now that the namespace carries it)
#include "gf256_mat.cuh"

namespace gf256
{
    void MatGF256::cpu_swap_row(size_t r1, size_t r2)
    {
        if (r1 == r2)
        {
            return;
        }
        base_t *p1 = at_base(r1, 0);
        base_t *p2 = at_base(r2, 0);
        for (size_t i = 0; i < width; i++)
        {
            base_t temp = p1[i];
            p1[i] = p2[i];
            p2[i] = temp;
        }
    }

    size_t cpu_elim_base(base_t *base_col, base_t base_col_len, size_t st_r, size_t w, vector<size_t> &p_col, vector<size_t> &p_row, const GF256 &gf)
    {
        size_t rank = 0;
        size_t pivot[gf256_num];
        size_t next[gf256_num];
        for (size_t pivot_col = 0; pivot_col < gf256_num; pivot_col++)
        {
            for (size_t r = rank; r < base_col_len; r++)
            {
                for (size_t i = 0; i < rank; i++)
                {
                    if (next[i] == r)
                    {
                        base_col[r] ^= gf.mul_base(get8(base_col[r], pivot[i]), base_col[i], pivot[i] + 1);
                        next[i]++;
                    }
                }
                if (get8(base_col[r], pivot_col) != 0)
                {
                    p_col.push_back(w * gf256_num + pivot_col);
                    p_row.push_back(st_r + r);
                    if (r != rank)
                    {
                        base_t temp = base_col[rank];
                        base_col[rank] = base_col[r];
                        base_col[r] = temp;
                    }
                    base_col[rank] = concat8(base_col[rank], pivot_col + 1, gf.mul_base(gf.inv(get8(base_col[rank], pivot_col)), base_col[rank], pivot_col + 1));
                    pivot[rank] = pivot_col;
                    next[rank] = rank + 1;
                    rank++;
                    break;
                }
            }
        }
        return rank;
    }

    __global__ void gpu_mksrc_kernel(base_t *src, size_t s_rowstride, base_t *spL, size_t src_rank, size_t width)
    {
        size_t w = blockIdx.x * blockDim.x + threadIdx.x;
        if (w >= width)
        {
            return;
        }
        base_t temp[gf256_num];
        for (size_t r = 0; r < src_rank; r++)
        {
            temp[r] = *at_base(src, s_rowstride, r, w);
        }
        for (size_t r = 0; r < src_rank; r++)
        {
            for (size_t i = 0; i < r; i++)
            {
                temp[r] ^= mul_base(get8(spL[r], i), temp[i]);
            }
            temp[r] = mul_base(get8(spL[r], r), temp[r]);
        }
        for (size_t rr = 1; rr < src_rank; rr++)
        {
            size_t r = src_rank - 1 - rr;
            for (size_t i = r + 1; i < src_rank; i++)
            {
                temp[r] ^= mul_base(get8(spL[r], i), temp[i]);
            }
        }
        for (size_t r = 0; r < src_rank; r++)
        {
            *at_base(src, s_rowstride, r, w) = temp[r];
        }
    }

    __global__ void gpu_elim_kernel(base_t *idx, base_t *tb, size_t tb_rowstride, base_t *data, size_t rowstride, size_t rank, base_t pivot_base, size_t st_skip, size_t width, size_t nrows)
    {
        size_t w = blockIdx.x * blockDim.x + threadIdx.x;
        size_t r = blockIdx.y * blockDim.y + threadIdx.y;
        if (w >= width || r >= nrows || (r >= st_skip && r < st_skip + rank))
        {
            return;
        }
        base_t val = idx[r];
        base_t temp = base_zero;
        for (size_t i = 0; i < rank; i++)
        {
            temp ^= *at_base(tb, tb_rowstride, i * (1 << gf256_len) + get8(val, get8(pivot_base, i)), w);
        }
        *at_base(data, rowstride, r, w) ^= temp;
    }

    __managed__ base_t spL[gf256_num];

    __host__ ElimResult MatGF256::gpu_elim(const GF256 &gf)
    {
        gf.cpy_to_constant();
        MatGF256 tb(gf256_num * (1 << gf256_len), ncols);
        base_t *base_col;
        cudaMallocManaged(&base_col, nrows * sizeof(base_t));
        base_t *idx;
        cudaMallocManaged(&idx, nrows * sizeof(base_t));
        size_t rank = 0;
        vector<size_t> p_col, p_row;
        progress::ProgressBar pb("GPU ELIMINATE", width);
        for (size_t w = 0; w < width; w++, pb.tick_display())
        {
            CUDA_CHECK(cudaMemcpy2D(base_col + rank, sizeof(base_t), at_base(rank, w), rowstride * sizeof(base_t), sizeof(base_t), nrows - rank, cudaMemcpyDefault));
            size_t src_rank = cpu_elim_base(base_col + rank, nrows - rank, rank, w, p_col, p_row, gf);
            if (src_rank == 0)
            {
                continue;
            }
            for (size_t i = 0; i < src_rank; i++)
            {
                cpu_swap_row(rank + i, p_row[rank + i]);
                spL[i] = base_zero;
            }
            base_t pivot_base = base_zero;
            for (size_t r = 0; r < src_rank; r++)
            {
                size_t loc = (p_col[rank + r] - w * gf256_num);
                set8(spL[r], gf.inv(get8(base_col[rank + r], loc)), r);
                for (size_t i = 0; i < r; i++)
                {
                    set8(spL[i], get8(base_col[rank + i], loc), r);
                }
                for (size_t i = r + 1; i < src_rank; i++)
                {
                    set8(spL[i], get8(base_col[rank + i], loc), r);
                }
                set8(pivot_base, loc, r);
            }
            dim3 block_src(THREAD_X);
            dim3 grid_src((width - w - 1) / block_src.x + 1);
            gpu_mksrc_kernel<<<grid_src, block_src>>>(at_base(rank, w), rowstride, spL, src_rank, width);
            cudaDeviceSynchronize();
            dim3 block_tb(THREAD_X, THREAD_Y);
            dim3 grid_tb((width - w - 1) / block_tb.x + 1, (src_rank * (1 << gf256_len) - 1) / block_tb.y + 1);
            gpu_mktb_kernel<<<grid_tb, block_tb>>>(tb.data, tb.rowstride, at_base(rank, w), rowstride, tb.width);
            cudaDeviceSynchronize();
            CUDA_CHECK(cudaMemcpy2D(idx, sizeof(base_t), at_base(0, w), rowstride * sizeof(base_t), sizeof(base_t), nrows, cudaMemcpyDefault));
            dim3 block(THREAD_X, THREAD_Y);
            dim3 grid((width - w - 1) / block.x + 1, (nrows - 1) / block.y + 1);
            gpu_elim_kernel<<<grid, block>>>(idx, tb.data, tb.rowstride, at_base(0, w), rowstride, src_rank, pivot_base, rank, width - w, nrows);
            cudaDeviceSynchronize();
            rank += src_rank;
            if (rank == nrows)
            {
                break;
            }
        }
        cudaFree(base_col);
        cudaFree(idx);
        return {rank, p_col, p_row};
    }
}
#endif
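For orientation: gpu_elim walks the matrix one 64-bit word (eight GF(2^8) entries) of columns at a time; cpu_elim_base finds up to eight pivots inside that slice, gpu_mksrc_kernel normalizes the pivot rows, gpu_mktb_kernel tabulates all 256 multiples of each normalized pivot row, and gpu_elim_kernel then clears a row's coefficient with one table lookup plus one XOR per pivot instead of a field multiplication per entry. A minimal CPU sketch of that lookup trick for a single pivot row, with entries stored one byte each rather than packed into base_t words (hypothetical layout, fixed 64-entry rows):

#include <cstdint>
#include <cstddef>

// tb[v][w] holds v * pivot_row[w] in GF(2^8), precomputed for all 256 values v
// (in the repo, gpu_mktb_kernel fills such a table via the constant mul table).
// Clearing the pivot column of `row` is then one table row XORed in, since
// XOR is both addition and subtraction in a field of characteristic 2.
void eliminate_row(uint8_t *row, const uint8_t tb[256][64], size_t width, size_t pivot_col)
{
    uint8_t v = row[pivot_col]; // coefficient to eliminate
    for (size_t w = 0; w < width && w < 64; w++)
    {
        row[w] ^= tb[v][w]; // row -= v * pivot_row
    }
}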

View File

@@ -4,189 +4,191 @@ (file contents after this commit, wrapped in namespace gf256)
#include "../header.cuh"
#include <set>

namespace gf256
{
    using gf256_t = uint8_t;

    static const size_t gf256_len = sizeof(gf256_t) * 8;
    static const size_t gf256_num = base_len / gf256_len;

    static const gf256_t gf256_zero = (gf256_t)0x00;
    static const gf256_t gf256_one = (gf256_t)0x01;
    static const gf256_t gf256_fullmask = (gf256_t)0xFF;

    static const base_t gf256_mask[8] = {
        (base_t)0x00'00'00'00'00'00'00'FF,
        (base_t)0x00'00'00'00'00'00'FF'00,
        (base_t)0x00'00'00'00'00'FF'00'00,
        (base_t)0x00'00'00'00'FF'00'00'00,
        (base_t)0x00'00'00'FF'00'00'00'00,
        (base_t)0x00'00'FF'00'00'00'00'00,
        (base_t)0x00'FF'00'00'00'00'00'00,
        (base_t)0xFF'00'00'00'00'00'00'00};

    __host__ __device__ inline size_t offset8(size_t idx)
    {
        return idx << 3;
    }
    __host__ __device__ inline gf256_t get8(base_t src, size_t idx)
    {
        return (gf256_t)(src >> offset8(idx));
    }
    // set8 requires the destination byte at idx to already be 0
    __host__ __device__ inline void set8(base_t &dst, gf256_t src, size_t idx)
    {
        dst |= (base_t)src << offset8(idx);
    }
    __host__ inline void del8(base_t &dst, size_t idx)
    {
        dst &= ~gf256_mask[idx];
    }
    __host__ inline base_t concat8(base_t dst_l, size_t idx_l, base_t dst_r)
    {
        if (idx_l == 0)
        {
            return dst_r;
        }
        if (idx_l == gf256_num)
        {
            return dst_l;
        }
        return (dst_l & (base_fullmask >> (base_len - offset8(idx_l)))) | (dst_r & (base_fullmask << offset8(idx_l)));
    }
    __host__ inline base_t rev8(base_t n)
    {
        n = (n & (base_t)0xFF'00'FF'00'FF'00'FF'00) >> 8 | (n & (base_t)0x00'FF'00'FF'00'FF'00'FF) << 8;
        n = (n & (base_t)0xFF'FF'00'00'FF'FF'00'00) >> 16 | (n & (base_t)0x00'00'FF'FF'00'00'FF'FF) << 16;
        return n >> 32 | n << 32;
    }

    __constant__ gf256_t d_mul_table[1 << gf256_len][1 << gf256_len];

    __device__ inline base_t mul_base(const gf256_t val, const base_t base)
    {
        if (val == 0)
        {
            return base_zero;
        }
        base_t temp = base_zero;
        for (size_t i = 0; i < gf256_len; i++)
        {
            set8(temp, d_mul_table[val][get8(base, i)], i);
        }
        return temp;
    }

    __global__ void gpu_mktb_kernel(base_t *tb, size_t tb_rowstride, base_t *src, size_t s_rowstride, size_t width)
    {
        size_t w = blockIdx.x * blockDim.x + threadIdx.x;
        size_t r = blockIdx.y * blockDim.y + threadIdx.y;
        if (w >= width)
        {
            return;
        }
        gf256_t val = get8(r, 0);
        base_t s = *at_base(src, s_rowstride, get8(r, 1), w);
        base_t d = mul_base(val, s);
        *at_base(tb, tb_rowstride, r, w) = d;
    }

    static const set<base_t> irreducible_polynomials_degree_08{0x11b, 0x11d, 0x12b, 0x12d, 0x139, 0x13f, 0x14d, 0x15f, 0x163, 0x165, 0x169, 0x171, 0x177, 0x17b, 0x187, 0x18b, 0x18d, 0x19f, 0x1a3, 0x1a9, 0x1b1, 0x1bd, 0x1c3, 0x1cf, 0x1d7, 0x1dd, 0x1e7, 0x1f3, 0x1f5, 0x1f9};

    class GF256
    {
    public:
        GF256(base_t poly)
        {
            assert(irreducible_polynomials_degree_08.count(poly) == 1);
            this->poly = poly;
            for (size_t x = 0; x < (1 << gf256_len); x++)
            {
                mul_table[x][gf256_zero] = gf256_zero;
                for (size_t d = 0; d < gf256_len; d++)
                {
                    gf256_t val = shift_left(x, d);
                    for (size_t y = (1 << d); y < (1 << (d + 1)); y++)
                    {
                        mul_table[x][y] = val ^ mul_table[x][y ^ (1 << d)];
                        if (mul_table[x][y] == gf256_one)
                        {
                            inv_table[x] = y;
                        }
                    }
                }
            }
            inv_table[gf256_zero] = gf256_zero;
        }
        gf256_t mul(const gf256_t x, const gf256_t y) const
        {
            return mul_table[x][y];
        }
        base_t mul_base(const gf256_t val, const base_t base, const size_t offset = 0) const
        {
            base_t temp = base_zero;
            for (size_t i = offset; i < gf256_num; i++)
            {
                set8(temp, mul(val, get8(base, i)), i);
            }
            return temp;
        }
        gf256_t inv(gf256_t x) const
        {
            return inv_table[x];
        }
        inline cudaError_t cpy_to_constant() const
        {
            return cudaMemcpyToSymbol(d_mul_table, mul_table, (1 << gf256_len) * (1 << gf256_len) * sizeof(gf256_t));
        }
        friend ostream &operator<<(ostream &out, const GF256 &gf);
        GF256() = delete;
        GF256(const GF256 &) = delete;
        GF256(GF256 &&) = delete;
        GF256 &operator=(const GF256 &) = delete;
        GF256 &operator=(GF256 &&) = delete;

    private:
        gf256_t shift_left(gf256_t x, size_t d)
        {
            base_t temp = (base_t)x << d;
            for (size_t i = gf256_len - 1 + d; i > gf256_len - 1; i--)
            {
                if (temp & (1 << i))
                {
                    temp ^= poly << (i - gf256_len);
                }
            }
            return temp;
        }
        base_t poly;
        gf256_t inv_table[1 << gf256_num];
        gf256_t mul_table[1 << gf256_num][1 << gf256_num];
    };

    ostream &operator<<(ostream &out, const GF256 &gf)
    {
        for (size_t x = 0; x < 1 << gf256_len; x++)
        {
            for (size_t y = 0; y < 1 << gf256_len; y++)
            {
                printf("%02X ", gf.mul_table[x][y]);
            }
            printf("\n");
        }
        return out;
    }
}
#endif
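A minimal host-side usage sketch of the class above: construct the field from one of the listed degree-8 irreducible polynomials (0x11b is the AES polynomial) and check that mul and inv are consistent. Assumes it runs inside namespace gf256 or after a using-directive:

GF256 gf(0x11b);
for (size_t x = 1; x < (1 << gf256_len); x++)
{
    assert(gf.mul((gf256_t)x, gf.inv((gf256_t)x)) == gf256_one);
}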

View File

@@ -7,220 +7,223 @@ (file contents after this commit, wrapped in namespace gf256)
#include <vector>
#include <algorithm>

namespace gf256
{
    struct ElimResult
    {
        size_t rank;
        vector<size_t> pivot;
        vector<size_t> swap_row;
    };

    class MatGF256
    {
    public:
        enum MatType
        {
            root,
            window,
            moved,
        };
        // Only root matrices can be constructed directly
        MatGF256(size_t nrows, size_t ncols) : nrows(nrows), ncols(ncols), type(root)
        {
            width = (ncols - 1) / gf256_num + 1;
            rowstride = ((width - 1) / 4 + 1) * 4; // align rows to 32 bytes (4 x 64-bit words)
            CUDA_CHECK(cudaMallocManaged((void **)&data, nrows * rowstride * sizeof(base_t)));
            CUDA_CHECK(cudaMemset(data, 0, nrows * rowstride * sizeof(base_t)));
        }
        // Window views can only be taken at base_t granularity
        MatGF256(const MatGF256 &src, size_t begin_ri, size_t begin_wi, size_t end_rj, size_t end_wj) : nrows(end_rj - begin_ri), ncols((end_wj == src.width ? src.ncols : end_wj * gf256_num) - begin_wi * gf256_num), width(end_wj - begin_wi), rowstride(src.rowstride), type(window), data(src.at_base(begin_ri, begin_wi))
        {
            assert(begin_ri < end_rj && end_rj <= src.nrows && begin_wi < end_wj && end_wj <= src.width);
        }
        // Copy construction always yields a root matrix
        MatGF256(const MatGF256 &m) : MatGF256(m.nrows, m.ncols)
        {
            CUDA_CHECK(cudaMemcpy2D(data, rowstride * sizeof(base_t), m.data, m.rowstride * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault));
        }
        MatGF256(MatGF256 &&m) noexcept : nrows(m.nrows), ncols(m.ncols), width(m.width), rowstride(m.rowstride), type(m.type), data(m.data)
        {
            m.type = moved;
            m.data = nullptr;
        }
        MatGF256 &operator=(const MatGF256 &m)
        {
            if (this == &m)
            {
                return *this;
            }
            assert(nrows == m.nrows && ncols == m.ncols);
            CUDA_CHECK(cudaMemcpy2D(data, rowstride * sizeof(base_t), m.data, m.rowstride * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault));
            return *this;
        }
        MatGF256 &operator=(MatGF256 &&m) noexcept
        {
            if (this == &m)
            {
                return *this;
            }
            if (type == root)
            {
                CUDA_CHECK(cudaFree(data));
            }
            nrows = m.nrows;
            ncols = m.ncols;
            width = m.width;
            rowstride = m.rowstride;
            type = m.type;
            data = m.data;
            m.type = moved;
            m.data = nullptr;
            return *this;
        }
        ~MatGF256()
        {
            if (type == root)
            {
                CUDA_CHECK(cudaFree(data));
            }
        }
        inline base_t *at_base(size_t r, size_t w) const
        {
            return data + r * rowstride + w;
        }
        void randomize(uint_fast32_t seed)
        {
            assert(type == root);
            static default_random_engine e(seed);
            static uniform_int_distribution<base_t> d;
            base_t lastmask = base_fullmask >> (width * base_len - ncols * gf256_len);
            for (size_t r = 0; r < nrows; r++)
            {
                for (size_t w = 0; w < width; w++)
                {
                    *at_base(r, w) = d(e);
                }
                *at_base(r, width - 1) &= lastmask;
            }
        }
        // Generate a random reduced row echelon matrix, picking the nrows pivot columns from the first rank_col columns
        void randomize(size_t rank_col, uint_fast32_t seed)
        {
            assert(nrows <= rank_col && rank_col <= ncols);
            randomize(seed);
            vector<size_t> pivot(rank_col);
            iota(pivot.begin(), pivot.end(), 0);
            random_shuffle(pivot.begin(), pivot.end());
            pivot.resize(nrows);
            sort(pivot.begin(), pivot.end());
            vector<base_t> pivotmask(width, base_fullmask);
            for (size_t r = 0; r < nrows; r++)
            {
                del8(pivotmask[pivot[r] / gf256_num], pivot[r] % gf256_num);
            }
            for (size_t r = 0; r < nrows; r++)
            {
                for (size_t w = 0; w < pivot[r] / gf256_num; w++)
                {
                    *at_base(r, w) = base_zero;
                }
                base_t *now = at_base(r, pivot[r] / gf256_num);
                *now = concat8(base_zero, pivot[r] % gf256_num + 1, *now & pivotmask[pivot[r] / gf256_num]);
                set8(*now, gf256_one, pivot[r] % gf256_num);
                for (size_t w = pivot[r] / gf256_num + 1; w < rank_col / gf256_num + 1; w++)
                {
                    *at_base(r, w) &= pivotmask[w];
                }
            }
        }
        bool operator==(const MatGF256 &m) const
        {
            if (nrows != m.nrows || ncols != m.ncols)
            {
                return false;
            }
            for (size_t r = 0; r < nrows; r++)
            {
                for (size_t w = 0; w < width; w++)
                {
                    if (*at_base(r, w) != *m.at_base(r, w))
                    {
                        return false;
                    }
                }
            }
            return true;
        }
        bool operator==(const base_t base) const
        {
            for (size_t r = 0; r < nrows; r++)
            {
                for (size_t w = 0; w < width; w++)
                {
                    if (*at_base(r, w) != base)
                    {
                        return false;
                    }
                }
            }
            return true;
        }
        void operator^=(const MatGF256 &m)
        {
            assert(nrows == m.nrows && ncols == m.ncols);
            for (size_t r = 0; r < nrows; r++)
            {
                for (size_t w = 0; w < width; w++)
                {
                    *at_base(r, w) ^= *m.at_base(r, w);
                }
            }
        }
        MatGF256 operator^(const MatGF256 &m) const
        {
            MatGF256 temp(*this);
            temp ^= m;
            return temp;
        }
        void gpu_addmul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf);
        friend MatGF256 gpu_mul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf);
        // size_t cpu_elim_base(base_t *base_col, size_t st_r, size_t w, vector<size_t> &p_col, vector<size_t> &p_row, base_t step[gf256_num], const GF256 &gf);
        void cpu_swap_row(size_t r1, size_t r2);
        // void cpu_mul_row(size_t r, gf256_t val, const GF256 &gf);
        ElimResult gpu_elim(const GF256 &gf);
        friend ostream &operator<<(ostream &out, const MatGF256 &m);
        size_t nrows, ncols, width;

    private:
        MatGF256() : nrows(0), ncols(0), width(0), rowstride(0), type(moved), data(nullptr) {}
        size_t rowstride;
        MatType type;
        base_t *data;
    };

    ostream &operator<<(ostream &out, const MatGF256 &m)
    {
        for (size_t r = 0; r < m.nrows; r++)
        {
            for (size_t w = 0; w < m.width; w++)
            {
                printf("%016lX ", rev8(*m.at_base(r, w)));
            }
            printf("\n");
        }
        return out;
    }
}
#endif
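A minimal sketch of driving the class (assuming a CUDA-capable device; gpu_elim is defined in the matching .cu file shown earlier):

using namespace gf256;

GF256 gf(0x11b);
MatGF256 m(1000, 2000);
m.randomize(12345);              // dense random matrix
ElimResult res = m.gpu_elim(gf); // reduces m in place
// res.rank, res.pivot and res.swap_row describe the echelon form.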

View File

@@ -3,54 +3,57 @@ (file contents after this commit; the former gf256_gpu_addmul_kernel drops its prefix)
#include "gf256_mat.cuh"

namespace gf256
{
    __global__ void gpu_addmul_kernel(base_t *a, size_t a_rowstride, base_t *tb, size_t tb_rowstride, base_t *c, size_t c_rowstride, size_t tb_num, size_t width, size_t nrows)
    {
        size_t w = blockIdx.x * blockDim.x + threadIdx.x;
        size_t r = blockIdx.y * blockDim.y + threadIdx.y;
        if (w >= width || r >= nrows)
        {
            return;
        }
        base_t val = *at_base(a, a_rowstride, r, 0);
        base_t temp = base_zero;
        for (size_t i = 0; i < tb_num; i++)
        {
            temp ^= *at_base(tb, tb_rowstride, i * (1 << gf256_len) + get8(val, i), w);
        }
        *at_base(c, c_rowstride, r, w) ^= temp;
    }

    __host__ void MatGF256::gpu_addmul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf)
    {
        assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols);
        gf.cpy_to_constant();
        MatGF256 tb(gf256_num * (1 << gf256_len), b.ncols);
        progress::ProgressBar pb("GPU MULTIPLY", a.width);
        for (size_t w = 0; w < a.width; w++, pb.tick_display())
        {
            size_t tb_num = min(gf256_num, a.ncols - w * gf256_num);
            dim3 block_tb(THREAD_X, THREAD_Y);
            dim3 grid_tb((b.width - 1) / block_tb.x + 1, (tb_num * (1 << gf256_len) - 1) / block_tb.y + 1);
            gpu_mktb_kernel<<<grid_tb, block_tb>>>(tb.data, tb.rowstride, b.at_base(w * gf256_num, 0), b.rowstride, tb.width);
            cudaDeviceSynchronize();
            dim3 block(THREAD_X, THREAD_Y);
            dim3 grid((b.width - 1) / block.x + 1, (nrows - 1) / block.y + 1);
            gpu_addmul_kernel<<<grid, block>>>(a.at_base(0, w), a.rowstride, tb.data, tb.rowstride, data, rowstride, tb_num, width, nrows);
            cudaDeviceSynchronize();
        }
    }

    __host__ MatGF256 gpu_mul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf)
    {
        assert(a.ncols == b.nrows);
        MatGF256 c(a.nrows, b.ncols);
        c.gpu_addmul(a, b, gf);
        return c;
    }
}
#endif
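Usage is a single call; gpu_mul allocates the product while gpu_addmul accumulates into an existing matrix (a sketch, assuming a, b and gf with matching dimensions):

MatGF256 c = gpu_mul(a, b, gf); // c = a * b
c.gpu_addmul(a, b, gf);         // c += a * b (XOR-accumulated, since addition in GF(2^8) is XOR)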

View File

@@ -3,25 +3,25 @@ (file contents after this commit; the gfp_bits macro is gone and the prime is renamed gfprime so it no longer collides with the namespace name)
#include "../header.cuh"

namespace gfp
{
    using gfp_t = uint32_t;

    static const gfp_t gfprime = 65521;

    static const gfp_t gfp_zero = (gfp_t)0;
    static const gfp_t gfp_one = (gfp_t)1;
    static const gfp_t gfp_fullmask = (gfp_t)0xFF'FF;

    __managed__ gfp_t gfp_inv_table[gfprime];

    void init_inv_table()
    {
        gfp_inv_table[0] = 0;
        gfp_inv_table[1] = 1;
        for (int i = 2; i < gfprime; ++i)
            gfp_inv_table[i] = (gfprime - gfprime / i) * gfp_inv_table[gfprime % i] % gfprime;
    }
}
#endif
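init_inv_table is the standard linear-time batch inversion: writing gfprime = (gfprime / i) * i + gfprime % i with integer division and reducing mod gfprime gives inv(i) = (gfprime - gfprime / i) * inv(gfprime % i) % gfprime, and since gfprime % i < i the entry it reads is always filled before it is needed. A host-side sanity check (a sketch; the table is managed memory, so the CPU can read it after init_inv_table() returns):

init_inv_table();
for (gfp_t i = 1; i < gfprime; i++)
{
    assert(i * gfp_inv_table[i] % gfprime == 1); // the product stays below 2^32
}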

View File

@@ -7,236 +7,235 @@ (file contents after this commit, wrapped in namespace gfp)
#include <vector>
#include <algorithm>

namespace gfp
{
    class MatGFP
    {
    public:
        enum MatType
        {
            root,
            window,
            moved,
        };
        // Only root matrices can be constructed directly
        MatGFP(size_t nrows, size_t ncols) : nrows(nrows), ncols(ncols), type(root)
        {
            width = ncols;
            rowstride = ((width - 1) / 8 + 1) * 8; // align rows to 32 bytes (8 x 32-bit words)
            CUDA_CHECK(cudaMallocManaged((void **)&data, nrows * rowstride * sizeof(gfp_t)));
            CUDA_CHECK(cudaMemset(data, 0, nrows * rowstride * sizeof(gfp_t)));
        }
        // Window views can only be taken at gfp_t granularity
        MatGFP(const MatGFP &src, size_t begin_ri, size_t begin_wi, size_t end_rj, size_t end_wj) : nrows(end_rj - begin_ri), ncols(end_wj - begin_wi), width(end_wj - begin_wi), rowstride(src.rowstride), type(window), data(src.at_base(begin_ri, begin_wi))
        {
            assert(begin_ri < end_rj && end_rj <= src.nrows && begin_wi < end_wj && end_wj <= src.width);
        }
        // Copy construction always yields a root matrix
        MatGFP(const MatGFP &m) : MatGFP(m.nrows, m.ncols)
        {
            CUDA_CHECK(cudaMemcpy2D(data, rowstride * sizeof(gfp_t), m.data, m.rowstride * sizeof(gfp_t), m.width * sizeof(gfp_t), nrows, cudaMemcpyDefault));
        }
        MatGFP(MatGFP &&m) noexcept : nrows(m.nrows), ncols(m.ncols), width(m.width), rowstride(m.rowstride), type(m.type), data(m.data)
        {
            m.type = moved;
            m.data = nullptr;
        }
        MatGFP &operator=(const MatGFP &m)
        {
            if (this == &m)
            {
                return *this;
            }
            assert(nrows == m.nrows && ncols == m.ncols);
            CUDA_CHECK(cudaMemcpy2D(data, rowstride * sizeof(gfp_t), m.data, m.rowstride * sizeof(gfp_t), m.width * sizeof(gfp_t), nrows, cudaMemcpyDefault));
            return *this;
        }
        MatGFP &operator=(MatGFP &&m) noexcept
        {
            if (this == &m)
            {
                return *this;
            }
            if (type == root)
            {
                CUDA_CHECK(cudaFree(data));
            }
            nrows = m.nrows;
            ncols = m.ncols;
            width = m.width;
            rowstride = m.rowstride;
            type = m.type;
            data = m.data;
            m.type = moved;
            m.data = nullptr;
            return *this;
        }
        ~MatGFP()
        {
            if (type == root)
            {
                CUDA_CHECK(cudaFree(data));
            }
        }
        inline gfp_t *at_base(size_t r, size_t w) const
        {
            return data + r * rowstride + w;
        }
        void randomize(uint_fast32_t seed)
        {
            assert(type == root);
            static default_random_engine e(seed);
            static uniform_int_distribution<gfp_t> d;
            for (size_t r = 0; r < nrows; r++)
            {
                for (size_t w = 0; w < width; w++)
                {
                    *at_base(r, w) = d(e) % gfprime;
                }
            }
        }
        // Generate a random reduced row echelon matrix, picking the nrows pivot columns from the first rank_col columns
        void randomize(size_t rank_col, uint_fast32_t seed)
        {
            assert(nrows <= rank_col && rank_col <= ncols);
            randomize(seed);
            vector<size_t> pivot(rank_col);
            iota(pivot.begin(), pivot.end(), 0);
            random_shuffle(pivot.begin(), pivot.end());
            pivot.resize(nrows);
            sort(pivot.begin(), pivot.end());
            vector<bool> pivotmask(width, true);
            for (size_t r = 0; r < nrows; r++)
            {
                pivotmask[pivot[r]] = false;
            }
            for (size_t r = 0; r < nrows; r++)
            {
                for (size_t w = 0; w < pivot[r]; w++)
                {
                    *at_base(r, w) = base_zero;
                }
                *at_base(r, pivot[r]) = base_one;
                for (size_t w = pivot[r] + 1; w < rank_col; w++)
                {
                    if (!pivotmask[w])
                    {
                        *at_base(r, w) = base_zero;
                    }
                }
            }
        }
        bool operator==(const MatGFP &m) const
        {
            if (nrows != m.nrows || ncols != m.ncols)
            {
                return false;
            }
            for (size_t r = 0; r < nrows; r++)
            {
                for (size_t w = 0; w < width; w++)
                {
                    if (*at_base(r, w) != *m.at_base(r, w))
                    {
                        return false;
                    }
                }
            }
            return true;
        }
        bool operator==(const gfp_t base) const
        {
            for (size_t r = 0; r < nrows; r++)
            {
                for (size_t w = 0; w < width; w++)
                {
                    if (*at_base(r, w) != base)
                    {
                        return false;
                    }
                }
            }
            return true;
        }
        void operator+=(const MatGFP &m)
        {
            assert(nrows == m.nrows && ncols == m.ncols);
            for (size_t r = 0; r < nrows; r++)
            {
                for (size_t w = 0; w < width; w++)
                {
                    *at_base(r, w) = (*at_base(r, w) + *m.at_base(r, w)) % gfprime;
                }
            }
        }
        MatGFP operator+(const MatGFP &m) const
        {
            MatGFP temp(*this);
            temp += m;
            return temp;
        }
        // a(m x k) * b(k x n), where k must not exceed 65536 (keeps the uint32_t accumulator below 2^32)
        void cpu_addmul(const MatGFP &a, const MatGFP &b)
        {
            assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols);
            assert(a.ncols <= 65536);
            for (size_t r = 0; r < nrows; r++)
            {
                for (size_t w = 0; w < width; w++)
                {
                    for (size_t i = 0; i < a.ncols; i++)
                    {
                        *at_base(r, w) += (*a.at_base(r, i) * *b.at_base(i, w)) % gfprime;
                    }
                    *at_base(r, w) %= gfprime;
                }
            }
        }
        void gpu_mul(const MatGFP &a, const MatGFP &b);
        MatGFP operator*(const MatGFP &m) const
        {
            MatGFP temp(nrows, m.width);
            temp.gpu_mul(*this, m);
            return temp;
        }
        // void cpu_swap_row(size_t r1, size_t r2);
        // ElimResult gpu_elim();
        friend ostream &operator<<(ostream &out, const MatGFP &m);
        size_t nrows, ncols, width;

    private:
        MatGFP() : nrows(0), ncols(0), width(0), rowstride(0), type(moved), data(nullptr) {}
        size_t rowstride;
        MatType type;
        gfp_t *data;
    };

    ostream &operator<<(ostream &out, const MatGFP &m)
    {
        for (size_t r = 0; r < m.nrows; r++)
        {
            for (size_t w = 0; w < m.width; w++)
            {
                printf("%05u ", *m.at_base(r, w));
            }
            printf("\n");
        }
        return out;
    }
}
#endif
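A minimal sketch mirroring the GF(2^8) side: operator* runs on the GPU, and cpu_addmul gives a CPU reference to cross-check it (dimensions illustrative):

using namespace gfp;

MatGFP a(256, 512), b(512, 128);
a.randomize(1);
b.randomize(2);
MatGFP c = a * b;   // launches gpu_mul

MatGFP d(256, 128); // zero-initialized by the constructor
d.cpu_addmul(a, b);
assert(c == d);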

View File

@@ -3,90 +3,83 @@ (file contents after this commit; the gfp_bits conditional paths are removed along with the macro)
#include "gfp_mat.cuh"

namespace gfp
{
    static const int BlockRow = 128, BlockCol = 128; // each thread block computes one BlockRow x BlockCol tile of C
    static const int StepSize = 8;                   // columns of A (= rows of B) consumed per loop iteration of a block

    static_assert(BlockCol % THREAD_X == 0 && BlockRow % THREAD_Y == 0);

    __global__ void gpu_mul_kernel(gfp_t *__restrict__ a, const size_t a_rs, gfp_t *__restrict__ b, const size_t b_rs, gfp_t *__restrict__ c, const size_t c_rs, const size_t nrows, const size_t ncols, const size_t nsteps)
    {
        const unsigned int bx = blockIdx.x;
        const unsigned int by = blockIdx.y;
        const unsigned int tx = threadIdx.x;
        const unsigned int ty = threadIdx.y;
        const unsigned int tid = ty * blockDim.x + tx;
        __shared__ gfp_t s_a[StepSize][BlockRow];
        __shared__ gfp_t s_b[StepSize][BlockCol];
        gfp_t tmp_c[BlockRow / THREAD_Y][BlockCol / THREAD_X] = {0};
        for (int s = 0; s < (nsteps - 1) / StepSize + 1; s++)
        {
            for (int k = tid; k < StepSize * BlockRow; k += blockDim.x * blockDim.y)
            {
                const int a_r = k / StepSize;
                const int a_c = k % StepSize;
                s_a[a_c][a_r] = by * BlockRow + a_r < nrows && s * StepSize + a_c < nsteps ? *at_base(a, a_rs, by * BlockRow + a_r, s * StepSize + a_c) : 0;
                const int b_r = k / BlockCol;
                const int b_c = k % BlockCol;
                s_b[b_r][b_c] = s * StepSize + b_r < nsteps && bx * BlockCol + b_c < ncols ? *at_base(b, b_rs, s * StepSize + b_r, bx * BlockCol + b_c) : 0;
            }
            __syncthreads();
            for (int k = 0; k < StepSize; k++)
            {
                for (int j = 0; j < BlockRow / THREAD_Y; j++)
                {
                    for (int i = 0; i < BlockCol / THREAD_X; i++)
                    {
                        tmp_c[j][i] += (s_a[k][j * THREAD_Y + ty] * s_b[k][i * THREAD_X + tx]) % gfprime;
                    }
                }
            }
            __syncthreads();
            if (s & gfp_fullmask == gfp_fullmask) // == binds tighter than &, so this tests s & 1: tmp_c is renormalized every other step
            {
                for (int j = 0; j < BlockRow / THREAD_Y; j++)
                {
                    for (int i = 0; i < BlockCol / THREAD_X; i++)
                    {
                        tmp_c[j][i] %= gfprime;
                    }
                }
            }
        }
        for (int j = 0; j < BlockRow / THREAD_Y; j++)
        {
            for (int i = 0; i < BlockCol / THREAD_X; i++)
            {
                if (by * BlockRow + j * THREAD_Y + ty < nrows && bx * BlockCol + i * THREAD_X + tx < ncols)
                {
                    *at_base(c, c_rs, by * BlockRow + j * THREAD_Y + ty, bx * BlockCol + i * THREAD_X + tx) = tmp_c[j][i] % gfprime;
                }
            }
        }
    }

    __host__ void MatGFP::gpu_mul(const MatGFP &a, const MatGFP &b)
    {
        assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols);
        dim3 block(THREAD_X, THREAD_Y);
        dim3 grid((width - 1) / block.x + 1, (nrows - 1) / block.y + 1);
        gpu_mul_kernel<<<grid, block>>>(a.data, a.rowstride, b.data, b.rowstride, data, rowstride, nrows, width, a.width);
        cudaDeviceSynchronize();
    }
}
#endif
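Why the deferred reduction in gpu_mul_kernel is safe: every term added to tmp_c is already reduced mod gfprime, so each is at most 65520, below 2^16; and because == binds tighter than &, the guard reduces tmp_c every other step, so between renormalizations an accumulator holds at most 2 * StepSize + 1 such terms, far below the uint32_t limit. A compile-time statement of that bound (a sketch one could drop into the namespace):

static_assert((uint64_t)(2 * StepSize + 1) * (gfprime - 1) < ((uint64_t)1 << 32),
              "tmp_c cannot overflow between every-other-step reductions");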

View File

@@ -6,14 +6,6 @@
 #include <cpp_progress.hpp>
-
-// matrix
-// #include <map>
-// #include <vector>
-// #include <algorithm>
-// #include <numeric>
-// #include <omp.h>
 
 using namespace std;
 
 using base_t = uint64_t;
@@ -25,13 +17,8 @@ static const base_t base_one = (base_t)0x00'00'00'00'00'00'00'01;
 static const base_t base_fullmask = (base_t)0xFF'FF'FF'FF'FF'FF'FF'FF;
 
-static const size_t THREAD_X = 32; // columns
-static const size_t THREAD_Y = 8;  // rows
+static const size_t THREAD_X = 16; // columns
+static const size_t THREAD_Y = 16; // rows
 
-// __host__ __device__ base_t *at_base(base_t *base, size_t rowstride, size_t r, size_t w)
-// {
-//     return base + r * rowstride + w;
-// }
 template <typename T>
 __host__ __device__ T *at_base(T *base, size_t rowstride, size_t r, size_t w)

View File

@@ -4,6 +4,8 @@
 #undef SHOW_PROGRESS_BAR
+
+using namespace gfp;
 
 int main()
 {
     int m = 1000, k = 1000, n = 1000;

View File

@@ -1,6 +1,8 @@
 #include <gtest/gtest.h>
 #include "test_header.cuh"
+
+using namespace gf256;
 
 bool test_gf256_elim(size_t rank, size_t rank_col, size_t nrows, size_t ncols, const GF256 &gf256, uint_fast32_t seed)
 {
     assert(rank <= nrows && rank <= rank_col && rank_col <= ncols);

View File

@@ -1,6 +1,8 @@
 #include <gtest/gtest.h>
 #include "test_header.cuh"
+
+using namespace gf256;
 
 vector<gf256_t> expect_inv_table{
     0x00, 0x01, 0x8E, 0xF4, 0x47, 0xA7, 0x7A, 0xBA, 0xAD, 0x9D, 0xDD, 0x98, 0x3D, 0xAA, 0x5D, 0x96,
     0xD8, 0x72, 0xC0, 0x58, 0xE0, 0x3E, 0x4C, 0x66, 0x90, 0xDE, 0x55, 0x80, 0xA0, 0x83, 0x4B, 0x2A,

View File

@@ -1,6 +1,8 @@
 #include <gtest/gtest.h>
 #include "test_header.cuh"
+
+using namespace gf256;
 
 TEST(TestGF256Matrix, Equal)
 {
     MatGF256 a(50, 50);

View File

@@ -1,6 +1,8 @@
 #include <gtest/gtest.h>
 #include "test_header.cuh"
+
+using namespace gfp;
 
 bool test_gfp_mul(size_t m, size_t k, size_t n, uint_fast32_t seed)
 {
     MatGFP a(m, k);