Partition the code using namespaces

This commit is contained in:
parent 2375705792
commit e73b158a37
@@ -1,6 +1,8 @@
 #include <benchmark/benchmark.h>
 #include "cuelim.cuh"
 
+using namespace gf256;
+
 template <MatGF256 (*GpuFunc)(const MatGF256 &, const MatGF256 &, const GF256 &)>
 void bench_gf256_mul(benchmark::State &state)
 {

@@ -1,6 +1,8 @@
 #include <benchmark/benchmark.h>
 #include "test_header.cuh"
 
+using namespace gfp;
+
 static void bench_gfp(benchmark::State &state)
 {
     uint_fast32_t seed = 41921095;
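The two benchmark hunks above show the consumer side of this commit: translation units that used the flat names now opt in with a using-directive. A minimal sketch of the alternative, fully qualified style; the `example` function and the matrix sizes are illustrative, not from this repo:

```cpp
// Hypothetical call site that qualifies names instead of importing them.
// MatGF256, GF256 and gpu_mul are the entities this commit moves into
// namespace gf256.
#include "cuelim.cuh"

void example(const gf256::GF256 &gf)
{
    gf256::MatGF256 a(64, 64), b(64, 64);
    // Argument-dependent lookup would also find gpu_mul without the prefix,
    // since its arguments live in namespace gf256.
    gf256::MatGF256 c = gf256::gpu_mul(a, b, gf);
}
```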
@@ -3,8 +3,10 @@
 
 #include "gf256_mat.cuh"
 
-void MatGF256::cpu_swap_row(size_t r1, size_t r2)
+namespace gf256
 {
+void MatGF256::cpu_swap_row(size_t r1, size_t r2)
+{
     if (r1 == r2)
     {
         return;
@@ -17,10 +19,10 @@ void MatGF256::cpu_swap_row(size_t r1, size_t r2)
         p1[i] = p2[i];
         p2[i] = temp;
     }
 }
 
-size_t gf256_cpu_elim_base(base_t *base_col, base_t base_col_len, size_t st_r, size_t w, vector<size_t> &p_col, vector<size_t> &p_row, const GF256 &gf)
+size_t cpu_elim_base(base_t *base_col, base_t base_col_len, size_t st_r, size_t w, vector<size_t> &p_col, vector<size_t> &p_row, const GF256 &gf)
 {
     size_t rank = 0;
     size_t pivot[gf256_num];
     size_t next[gf256_num];
@@ -56,10 +58,10 @@ size_t gf256_cpu_elim_base(base_t *base_col, base_t base_col_len, size_t st_r, s
         }
     }
     return rank;
 }
 
-__global__ void gf256_gpu_mksrc_kernel(base_t *src, size_t s_rowstride, base_t *spL, size_t src_rank, size_t width)
+__global__ void gpu_mksrc_kernel(base_t *src, size_t s_rowstride, base_t *spL, size_t src_rank, size_t width)
 {
     size_t w = blockIdx.x * blockDim.x + threadIdx.x;
     if (w >= width)
     {
@@ -90,10 +92,10 @@ __global__ void gf256_gpu_mksrc_kernel(base_t *src, size_t s_rowstride, base_t *
     {
         *at_base(src, s_rowstride, r, w) = temp[r];
     }
 }
 
-__global__ void gf256_gpu_elim_kernel(base_t *idx, base_t *tb, size_t tb_rowstride, base_t *data, size_t rowstride, size_t rank, base_t pivot_base, size_t st_skip, size_t width, size_t nrows)
+__global__ void gpu_elim_kernel(base_t *idx, base_t *tb, size_t tb_rowstride, base_t *data, size_t rowstride, size_t rank, base_t pivot_base, size_t st_skip, size_t width, size_t nrows)
 {
     size_t w = blockIdx.x * blockDim.x + threadIdx.x;
     size_t r = blockIdx.y * blockDim.y + threadIdx.y;
 
@@ -109,12 +111,12 @@ __global__ void gf256_gpu_elim_kernel(base_t *idx, base_t *tb, size_t tb_rowstri
         temp ^= *at_base(tb, tb_rowstride, i * (1 << gf256_len) + get8(val, get8(pivot_base, i)), w);
     }
     *at_base(data, rowstride, r, w) ^= temp;
 }
 
 __managed__ base_t spL[gf256_num];
 
 __host__ ElimResult MatGF256::gpu_elim(const GF256 &gf)
 {
     gf.cpy_to_constant();
     MatGF256 tb(gf256_num * (1 << gf256_len), ncols);
 
@@ -131,7 +133,7 @@ __host__ ElimResult MatGF256::gpu_elim(const GF256 &gf)
     {
         CUDA_CHECK(cudaMemcpy2D(base_col + rank, sizeof(base_t), at_base(rank, w), rowstride * sizeof(base_t), sizeof(base_t), nrows - rank, cudaMemcpyDefault));
 
-        size_t src_rank = gf256_cpu_elim_base(base_col + rank, nrows - rank, rank, w, p_col, p_row, gf);
+        size_t src_rank = cpu_elim_base(base_col + rank, nrows - rank, rank, w, p_col, p_row, gf);
 
         if (src_rank == 0)
         {
@@ -162,7 +164,7 @@ __host__ ElimResult MatGF256::gpu_elim(const GF256 &gf)
 
         dim3 block_src(THREAD_X);
         dim3 grid_src((width - w - 1) / block_src.x + 1);
-        gf256_gpu_mksrc_kernel<<<grid_src, block_src>>>(at_base(rank, w), rowstride, spL, src_rank, width);
+        gpu_mksrc_kernel<<<grid_src, block_src>>>(at_base(rank, w), rowstride, spL, src_rank, width);
         cudaDeviceSynchronize();
 
         dim3 block_tb(THREAD_X, THREAD_Y);
@@ -174,7 +176,7 @@ __host__ ElimResult MatGF256::gpu_elim(const GF256 &gf)
 
         dim3 block(THREAD_X, THREAD_Y);
         dim3 grid((width - w - 1) / block.x + 1, (nrows - 1) / block.y + 1);
-        gf256_gpu_elim_kernel<<<grid, block>>>(idx, tb.data, tb.rowstride, at_base(0, w), rowstride, src_rank, pivot_base, rank, width - w, nrows);
+        gpu_elim_kernel<<<grid, block>>>(idx, tb.data, tb.rowstride, at_base(0, w), rowstride, src_rank, pivot_base, rank, width - w, nrows);
         cudaDeviceSynchronize();
 
         rank += src_rank;
@@ -187,6 +189,7 @@ __host__ ElimResult MatGF256::gpu_elim(const GF256 &gf)
     cudaFree(base_col);
     cudaFree(idx);
     return {rank, p_col, p_row};
+}
 }
 
 #endif
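`gpu_elim` above repeatedly pulls one `base_t`-wide column slab off the pitched device matrix with `cudaMemcpy2D`, runs `cpu_elim_base` on it to pick pivots, then launches `gpu_mksrc_kernel`/`gpu_elim_kernel` to apply them. The strided-copy idiom is worth isolating; in this sketch `copy_column` and its parameters are illustrative names, only the `cudaMemcpy2D` call mirrors the diff:

```cpp
#include <cstddef>
#include <cstdint>
#include <cuda_runtime.h>

using base_t = uint64_t;

// Copy one base_t-wide column (word index w of every row) out of a
// row-pitched matrix into a densely packed array, as gpu_elim does
// before handing the slab to cpu_elim_base.
void copy_column(base_t *col, const base_t *dev, size_t rowstride, size_t nrows, size_t w)
{
    cudaMemcpy2D(col, sizeof(base_t),                 // dst, dst pitch: packed output
                 dev + w, rowstride * sizeof(base_t), // src, src pitch: one row apart
                 sizeof(base_t), nrows,               // width = one word, height = nrows
                 cudaMemcpyDefault);
}
```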
@@ -4,16 +4,18 @@
 #include "../header.cuh"
 #include <set>
 
-using gf256_t = uint8_t;
+namespace gf256
+{
+using gf256_t = uint8_t;
 
 static const size_t gf256_len = sizeof(gf256_t) * 8;
 static const size_t gf256_num = base_len / gf256_len;
 
 static const gf256_t gf256_zero = (gf256_t)0x00;
 static const gf256_t gf256_one = (gf256_t)0x01;
 static const gf256_t gf256_fullmask = (gf256_t)0xFF;
 
 static const base_t gf256_mask[8] = {
     (base_t)0x00'00'00'00'00'00'00'FF,
     (base_t)0x00'00'00'00'00'00'FF'00,
     (base_t)0x00'00'00'00'00'FF'00'00,
@@ -23,29 +25,29 @@ static const base_t gf256_mask[8] = {
     (base_t)0x00'FF'00'00'00'00'00'00,
     (base_t)0xFF'00'00'00'00'00'00'00};
 
 __host__ __device__ inline size_t offset8(size_t idx)
 {
     return idx << 3;
 }
 
 __host__ __device__ inline gf256_t get8(base_t src, size_t idx)
 {
     return (gf256_t)(src >> offset8(idx));
 }
 
 // the byte at the target position must already be 0 before calling set8
 __host__ __device__ inline void set8(base_t &dst, gf256_t src, size_t idx)
 {
     dst |= (base_t)src << offset8(idx);
 }
 
 __host__ inline void del8(base_t &dst, size_t idx)
 {
     dst &= ~gf256_mask[idx];
 }
 
 __host__ inline base_t concat8(base_t dst_l, size_t idx_l, base_t dst_r)
 {
     if (idx_l == 0)
     {
         return dst_r;
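These helpers pack eight GF(2⁸) elements into one 64-bit `base_t`, with index 0 in the least significant byte; `set8` only ORs, which is why the comment above it requires the target byte to be zero. A host-only round-trip sketch (the `main` harness is illustrative; the three helpers are copied from the definitions just shown):

```cpp
#include <cassert>
#include <cstdint>

using base_t = uint64_t;
using gf256_t = uint8_t;

inline size_t offset8(size_t idx) { return idx << 3; }
inline gf256_t get8(base_t src, size_t idx) { return (gf256_t)(src >> offset8(idx)); }
inline void set8(base_t &dst, gf256_t src, size_t idx) { dst |= (base_t)src << offset8(idx); }

int main()
{
    base_t word = 0;
    set8(word, 0xAB, 0); // lowest byte
    set8(word, 0xCD, 7); // highest byte
    assert(get8(word, 0) == 0xAB && get8(word, 7) == 0xCD);
    // set8 only ORs: writing over a non-zero byte merges bits instead of
    // replacing them (0xAB | 0x01 == 0xAB here), hence the zeroing rule.
    set8(word, 0x01, 0);
    assert(get8(word, 0) == 0xAB);
    return 0;
}
```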
@@ -55,19 +57,19 @@ __host__ inline base_t concat8(base_t dst_l, size_t idx_l, base_t dst_r)
         return dst_l;
     }
     return (dst_l & (base_fullmask >> (base_len - offset8(idx_l)))) | (dst_r & (base_fullmask << offset8(idx_l)));
 }
 
 __host__ inline base_t rev8(base_t n)
 {
     n = (n & (base_t)0xFF'00'FF'00'FF'00'FF'00) >> 8 | (n & (base_t)0x00'FF'00'FF'00'FF'00'FF) << 8;
     n = (n & (base_t)0xFF'FF'00'00'FF'FF'00'00) >> 16 | (n & (base_t)0x00'00'FF'FF'00'00'FF'FF) << 16;
     return n >> 32 | n << 32;
 }
 
 __constant__ gf256_t d_mul_table[1 << gf256_len][1 << gf256_len];
 
 __device__ inline base_t mul_base(const gf256_t val, const base_t base)
 {
     if (val == 0)
     {
         return base_zero;
@@ -78,10 +80,10 @@ __device__ inline base_t mul_base(const gf256_t val, const base_t base)
         set8(temp, d_mul_table[val][get8(base, i)], i);
     }
     return temp;
 }
 
 __global__ void gpu_mktb_kernel(base_t *tb, size_t tb_rowstride, base_t *src, size_t s_rowstride, size_t width)
 {
     size_t w = blockIdx.x * blockDim.x + threadIdx.x;
     size_t r = blockIdx.y * blockDim.y + threadIdx.y;
 
@@ -94,13 +96,13 @@ __global__ void gpu_mktb_kernel(base_t *tb, size_t tb_rowstride, base_t *src, si
     base_t s = *at_base(src, s_rowstride, get8(r, 1), w);
     base_t d = mul_base(val, s);
     *at_base(tb, tb_rowstride, r, w) = d;
 }
 
 static const set<base_t> irreducible_polynomials_degree_08{0x11b, 0x11d, 0x12b, 0x12d, 0x139, 0x13f, 0x14d, 0x15f, 0x163, 0x165, 0x169, 0x171, 0x177, 0x17b, 0x187, 0x18b, 0x18d, 0x19f, 0x1a3, 0x1a9, 0x1b1, 0x1bd, 0x1c3, 0x1cf, 0x1d7, 0x1dd, 0x1e7, 0x1f3, 0x1f5, 0x1f9};
 
 class GF256
 {
 public:
     GF256(base_t poly)
     {
         assert(irreducible_polynomials_degree_08.count(poly) == 1);
@@ -157,7 +159,7 @@ public:
     GF256 &operator=(const GF256 &) = delete;
     GF256 &operator=(GF256 &&) = delete;
 
 private:
     gf256_t shift_left(gf256_t x, size_t d)
     {
         base_t temp = (base_t)x << d;
@@ -174,10 +176,10 @@ private:
     base_t poly;
     gf256_t inv_table[1 << gf256_num];
     gf256_t mul_table[1 << gf256_num][1 << gf256_num];
 };
 
 ostream &operator<<(ostream &out, const GF256 &gf)
 {
     for (size_t x = 0; x < 1 << gf256_len; x++)
     {
         for (size_t y = 0; y < 1 << gf256_len; y++)
@@ -187,6 +189,7 @@ ostream &operator<<(ostream &out, const GF256 &gf)
         printf("\n");
     }
     return out;
+}
 }
 
 #endif
@@ -7,16 +7,18 @@
 #include <vector>
 #include <algorithm>
 
-struct ElimResult
+namespace gf256
 {
+struct ElimResult
+{
     size_t rank;
     vector<size_t> pivot;
     vector<size_t> swap_row;
 };
 
 class MatGF256
 {
 public:
     enum MatType
     {
         root,
@@ -202,16 +204,16 @@ public:
 
     size_t nrows, ncols, width;
 
 private:
     MatGF256() : nrows(0), ncols(0), width(0), rowstride(0), type(moved), data(nullptr) {}
 
     size_t rowstride;
     MatType type;
     base_t *data;
 };
 
 ostream &operator<<(ostream &out, const MatGF256 &m)
 {
     for (size_t r = 0; r < m.nrows; r++)
     {
         for (size_t w = 0; w < m.width; w++)
@@ -221,6 +223,7 @@ ostream &operator<<(ostream &out, const MatGF256 &m)
         printf("\n");
     }
     return out;
+}
 }
 
 #endif
@@ -3,8 +3,10 @@
 
 #include "gf256_mat.cuh"
 
-__global__ void gf256_gpu_addmul_kernel(base_t *a, size_t a_rowstride, base_t *tb, size_t tb_rowstride, base_t *c, size_t c_rowstride, size_t tb_num, size_t width, size_t nrows)
+namespace gf256
 {
+__global__ void gpu_addmul_kernel(base_t *a, size_t a_rowstride, base_t *tb, size_t tb_rowstride, base_t *c, size_t c_rowstride, size_t tb_num, size_t width, size_t nrows)
+{
     size_t w = blockIdx.x * blockDim.x + threadIdx.x;
     size_t r = blockIdx.y * blockDim.y + threadIdx.y;
 
@@ -20,10 +22,10 @@ __global__ void gf256_gpu_addmul_kernel(base_t *a, size_t a_rowstride, base_t *t
         temp ^= *at_base(tb, tb_rowstride, i * (1 << gf256_len) + get8(val, i), w);
     }
     *at_base(c, c_rowstride, r, w) ^= temp;
 }
 
 __host__ void MatGF256::gpu_addmul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf)
 {
     assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols);
     gf.cpy_to_constant();
     MatGF256 tb(gf256_num * (1 << gf256_len), b.ncols);
@@ -40,17 +42,18 @@ __host__ void MatGF256::gpu_addmul(const MatGF256 &a, const MatGF256 &b, const G
 
         dim3 block(THREAD_X, THREAD_Y);
         dim3 grid((b.width - 1) / block.x + 1, (nrows - 1) / block.y + 1);
-        gf256_gpu_addmul_kernel<<<grid, block>>>(a.at_base(0, w), a.rowstride, tb.data, tb.rowstride, data, rowstride, tb_num, width, nrows);
+        gpu_addmul_kernel<<<grid, block>>>(a.at_base(0, w), a.rowstride, tb.data, tb.rowstride, data, rowstride, tb_num, width, nrows);
         cudaDeviceSynchronize();
     }
 }
 
 __host__ MatGF256 gpu_mul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf)
 {
     assert(a.ncols == b.nrows);
     MatGF256 c(a.nrows, b.ncols);
     c.gpu_addmul(a, b, gf);
     return c;
+}
 }
 
 #endif
@@ -3,25 +3,25 @@
 
 #include "../header.cuh"
 
-using gfp_t = uint32_t;
-#define gfp_bits 32
-
-static_assert(sizeof(gfp_t) * 8 == gfp_bits);
-
-static const gfp_t gfp = 65521;
-
-static const gfp_t gfp_zero = (gfp_t)0;
-static const gfp_t gfp_one = (gfp_t)1;
-static const gfp_t gfp_fullmask = (gfp_t)0xFF'FF;
-
-__managed__ gfp_t gfp_inv_table[gfp];
-
-void init_inv_table()
+namespace gfp
 {
+using gfp_t = uint32_t;
+
+static const gfp_t gfprime = 65521;
+
+static const gfp_t gfp_zero = (gfp_t)0;
+static const gfp_t gfp_one = (gfp_t)1;
+static const gfp_t gfp_fullmask = (gfp_t)0xFF'FF;
+
+__managed__ gfp_t gfp_inv_table[gfprime];
+
+void init_inv_table()
+{
     gfp_inv_table[0] = 0;
     gfp_inv_table[1] = 1;
-    for (int i = 2; i < gfp; ++i)
-        gfp_inv_table[i] = (gfp - gfp / i) * gfp_inv_table[gfp % i] % gfp;
+    for (int i = 2; i < gfprime; ++i)
+        gfp_inv_table[i] = (gfprime - gfprime / i) * gfp_inv_table[gfprime % i] % gfprime;
+}
 }
 
 #endif
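The rewritten `init_inv_table` fills every inverse modulo `gfprime` in O(p) time with the classic recurrence: write p = q·i + r where q = p/i and r = p%i; then q·i + r ≡ 0 (mod p), so i⁻¹ ≡ (p − q)·r⁻¹ (mod p), and r < i guarantees `gfp_inv_table[r]` was computed first. A standalone host-only check of that recurrence (p = 65521 as in the header above; the harness itself is illustrative):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
    const uint64_t p = 65521; // the prime this repo's gfp code uses
    std::vector<uint64_t> inv(p);
    inv[1] = 1;
    for (uint64_t i = 2; i < p; ++i)
        inv[i] = (p - p / i) * inv[p % i] % p; // i^{-1} = -(p/i) * (p%i)^{-1} mod p
    for (uint64_t i = 1; i < p; ++i)
        assert(i * inv[i] % p == 1); // every entry really is an inverse
    return 0;
}
```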
@@ -7,9 +7,11 @@
 #include <vector>
 #include <algorithm>
 
-class MatGFP
+namespace gfp
 {
-public:
+class MatGFP
+{
+public:
     enum MatType
     {
         root,
@@ -92,7 +94,7 @@ public:
     {
         for (size_t w = 0; w < width; w++)
         {
-            *at_base(r, w) = d(e) % gfp;
+            *at_base(r, w) = d(e) % gfprime;
         }
     }
 }
@@ -171,7 +173,7 @@ public:
     {
         for (size_t w = 0; w < width; w++)
         {
-            *at_base(r, w) = (*at_base(r, w) + *m.at_base(r, w)) % gfp;
+            *at_base(r, w) = (*at_base(r, w) + *m.at_base(r, w)) % gfprime;
         }
     }
 }
@@ -192,9 +194,9 @@ public:
     {
         for (size_t i = 0; i < a.ncols; i++)
         {
-            *at_base(r, w) += (*a.at_base(r, i) * *b.at_base(i, w)) % gfp;
+            *at_base(r, w) += (*a.at_base(r, i) * *b.at_base(i, w)) % gfprime;
         }
-        *at_base(r, w) %= gfp;
+        *at_base(r, w) %= gfprime;
     }
 }
 }
@@ -214,29 +216,26 @@ public:
 
     size_t nrows, ncols, width;
 
 private:
     MatGFP() : nrows(0), ncols(0), width(0), rowstride(0), type(moved), data(nullptr) {}
 
     size_t rowstride;
     MatType type;
     gfp_t *data;
 };
 
 ostream &operator<<(ostream &out, const MatGFP &m)
 {
     for (size_t r = 0; r < m.nrows; r++)
     {
         for (size_t w = 0; w < m.width; w++)
         {
-#if gfp_bits == 64
-            printf("%05lu ", *m.at_base(r, w));
-#else
             printf("%05u ", *m.at_base(r, w));
-#endif
         }
         printf("\n");
     }
     return out;
+}
 }
 
 #endif
@@ -3,27 +3,25 @@
 
 #include "gfp_mat.cuh"
 
-static const int BlockRow = 128, BlockCol = 128; // each block handles one sub-tile of matrix C
-static const int StepSize = 8;                   // columns of A (= rows of B) processed per block iteration
-
-static_assert(BlockCol % THREAD_X == 0 && BlockRow % THREAD_Y == 0);
-
-__global__ void gfp_gpu_mul_kernel(gfp_t *__restrict__ a, const size_t a_rs, gfp_t *__restrict__ b, const size_t b_rs, gfp_t *__restrict__ c, const size_t c_rs, const size_t nrows, const size_t ncols, const size_t nsteps)
+namespace gfp
 {
+
+static const int BlockRow = 128, BlockCol = 128; // each block handles one sub-tile of matrix C
+static const int StepSize = 8;                   // columns of A (= rows of B) processed per block iteration
+
+static_assert(BlockCol % THREAD_X == 0 && BlockRow % THREAD_Y == 0);
+
+__global__ void gpu_mul_kernel(gfp_t *__restrict__ a, const size_t a_rs, gfp_t *__restrict__ b, const size_t b_rs, gfp_t *__restrict__ c, const size_t c_rs, const size_t nrows, const size_t ncols, const size_t nsteps)
+{
 
     const unsigned int bx = blockIdx.x;
     const unsigned int by = blockIdx.y;
     const unsigned int tx = threadIdx.x;
     const unsigned int ty = threadIdx.y;
     const unsigned int tid = ty * blockDim.x + tx;
 
-#if gfp_bits == 64
-    __shared__ alignas(8) gfp_t s_a[StepSize][BlockRow];
-    __shared__ alignas(8) gfp_t s_b[StepSize][BlockCol];
-#else
     __shared__ gfp_t s_a[StepSize][BlockRow];
     __shared__ gfp_t s_b[StepSize][BlockCol];
-#endif
 
     gfp_t tmp_c[BlockRow / THREAD_Y][BlockCol / THREAD_X] = {0};
 
@@ -45,27 +43,21 @@ __global__ void gfp_gpu_mul_kernel(gfp_t *__restrict__ a, const size_t a_rs, gfp
             {
                 for (int i = 0; i < BlockCol / THREAD_X; i++)
                 {
-#if gfp_bits == 64
-                    tmp_c[j][i] += (s_a[k][j * THREAD_Y + ty] * s_b[k][i * THREAD_X + tx]);
-#else
-                    tmp_c[j][i] += (s_a[k][j * THREAD_Y + ty] * s_b[k][i * THREAD_X + tx]) % gfp;
-#endif
+                    tmp_c[j][i] += (s_a[k][j * THREAD_Y + ty] * s_b[k][i * THREAD_X + tx]) % gfprime;
                 }
             }
         }
        __syncthreads();
-#if gfp_bits != 64
        if (s & gfp_fullmask == gfp_fullmask)
        {
            for (int j = 0; j < BlockRow / THREAD_Y; j++)
            {
                for (int i = 0; i < BlockCol / THREAD_X; i++)
                {
-                   tmp_c[j][i] %= gfp;
+                   tmp_c[j][i] %= gfprime;
                }
            }
        }
-#endif
     }
     for (int j = 0; j < BlockRow / THREAD_Y; j++)
     {
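The unified `% gfprime` line that replaces the `gfp_bits` branches is arithmetically safe for `gfp_t = uint32_t`: with p = 65521 a product of two reduced residues is at most 65520² = 4,292,870,400 < 2³² − 1, and once reduced each summand is below 2¹⁶, so the 32-bit accumulator survives at least 2¹⁶ additions before the periodic `tmp_c[j][i] %= gfprime` pass must run. (Note that `s & gfp_fullmask == gfp_fullmask` parses as `s & (gfp_fullmask == gfp_fullmask)` because `==` binds tighter than `&`, so the reduction actually fires on every odd step — more often than the mask comparison suggests, but still correct.) A compile-time restatement of the headroom argument (a sketch, not repo code):

```cpp
#include <cstdint>

using gfp_t = uint32_t;             // as in this repo's gfp header
static const gfp_t gfprime = 65521;

// A product of two reduced residues still fits in 32 bits...
static_assert((uint64_t)(gfprime - 1) * (gfprime - 1) <= UINT32_MAX,
              "residue product fits in gfp_t");
// ...and once reduced, each summand is < 2^16, so the accumulator can
// absorb at least 2^16 summands between reduction passes.
static_assert((uint64_t)(gfprime - 1) * (1ull << 16) <= UINT32_MAX,
              "65536 reduced summands fit before wrap-around");

int main() { return 0; }
```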
@@ -73,20 +65,21 @@ __global__ void gfp_gpu_mul_kernel(gfp
         {
             if (by * BlockRow + j * THREAD_Y + ty < nrows && bx * BlockCol + i * THREAD_X + tx < ncols)
             {
-                *at_base(c, c_rs, by * BlockRow + j * THREAD_Y + ty, bx * BlockCol + i * THREAD_X + tx) = tmp_c[j][i] % gfp;
+                *at_base(c, c_rs, by * BlockRow + j * THREAD_Y + ty, bx * BlockCol + i * THREAD_X + tx) = tmp_c[j][i] % gfprime;
+            }
         }
     }
 }
-}
 
 __host__ void MatGFP::gpu_mul(const MatGFP &a, const MatGFP &b)
 {
     assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols);
 
     dim3 block(THREAD_X, THREAD_Y);
     dim3 grid((width - 1) / block.x + 1, (nrows - 1) / block.y + 1);
-    gfp_gpu_mul_kernel<<<grid, block>>>(a.data, a.rowstride, b.data, b.rowstride, data, rowstride, nrows, width, a.width);
+    gpu_mul_kernel<<<grid, block>>>(a.data, a.rowstride, b.data, b.rowstride, data, rowstride, nrows, width, a.width);
     cudaDeviceSynchronize();
+}
 }
 
 #endif
@@ -6,14 +6,6 @@
 
 #include <cpp_progress.hpp>
 
-// matrix
-// #include <map>
-// #include <vector>
-
-// #include <algorithm>
-// #include <numeric>
-// #include <omp.h>
-
 using namespace std;
 
 using base_t = uint64_t;
@@ -25,13 +17,8 @@ static const base_t base_one = (base_t)0x00'00'00'00'00'00'00'01;
 
 static const base_t base_fullmask = (base_t)0xFF'FF'FF'FF'FF'FF'FF'FF;
 
-static const size_t THREAD_X = 32; // columns
-static const size_t THREAD_Y = 8;  // rows
+static const size_t THREAD_X = 16; // columns
+static const size_t THREAD_Y = 16; // rows
 
-// __host__ __device__ base_t *at_base(base_t *base, size_t rowstride, size_t r, size_t w)
-// {
-//     return base + r * rowstride + w;
-// }
-
 template <typename T>
 __host__ __device__ T *at_base(T *base, size_t rowstride, size_t r, size_t w)
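The same header retunes the default launch shape from 32×8 to 16×16 threads; both give 256 threads per block, only the tile aspect ratio changes. Every grid computation in the diff uses the `(n - 1) / b + 1` ceiling-division idiom, checked below (illustrative, not repo code):

```cpp
#include <cassert>
#include <cstddef>

// Ceiling division as used for grid sizing: valid for n >= 1, b >= 1.
constexpr size_t ceil_div(size_t n, size_t b) { return (n - 1) / b + 1; }

int main()
{
    static_assert(32 * 8 == 16 * 16, "old and new block shapes both run 256 threads");
    assert(ceil_div(1000, 16) == 63); // 63 blocks of 16 cover 1008 >= 1000 columns
    return 0;
}
```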
@@ -4,6 +4,8 @@
 
 #undef SHOW_PROGRESS_BAR
 
+using namespace gfp;
+
 int main()
 {
     int m = 1000, k = 1000, n = 1000;
@@ -1,6 +1,8 @@
 #include <gtest/gtest.h>
 #include "test_header.cuh"
 
+using namespace gf256;
+
 bool test_gf256_elim(size_t rank, size_t rank_col, size_t nrows, size_t ncols, const GF256 &gf256, uint_fast32_t seed)
 {
     assert(rank <= nrows && rank <= rank_col && rank_col <= ncols);
@@ -1,6 +1,8 @@
 #include <gtest/gtest.h>
 #include "test_header.cuh"
 
+using namespace gf256;
+
 vector<gf256_t> expect_inv_table{
     0x00, 0x01, 0x8E, 0xF4, 0x47, 0xA7, 0x7A, 0xBA, 0xAD, 0x9D, 0xDD, 0x98, 0x3D, 0xAA, 0x5D, 0x96,
     0xD8, 0x72, 0xC0, 0x58, 0xE0, 0x3E, 0x4C, 0x66, 0x90, 0xDE, 0x55, 0x80, 0xA0, 0x83, 0x4B, 0x2A,
@@ -1,6 +1,8 @@
 #include <gtest/gtest.h>
 #include "test_header.cuh"
 
+using namespace gf256;
+
 TEST(TestGF256Matrix, Equal)
 {
     MatGF256 a(50, 50);
@@ -1,6 +1,8 @@
 #include <gtest/gtest.h>
 #include "test_header.cuh"
 
+using namespace gfp;
+
 bool test_gfp_mul(size_t m, size_t k, size_t n, uint_fast32_t seed)
 {
     MatGFP a(m, k);