diff --git a/benchmark/bench_gf256_mul.cu b/benchmark/bench_gf256_mul.cu index 1abe6e9..9bdf7e7 100644 --- a/benchmark/bench_gf256_mul.cu +++ b/benchmark/bench_gf256_mul.cu @@ -1,6 +1,8 @@ #include <benchmark/benchmark.h> #include "cuelim.cuh" +using namespace gf256; + template void bench_gf256_mul(benchmark::State &state) {
diff --git a/benchmark/bench_gfp_mul.cu b/benchmark/bench_gfp_mul.cu index f1ebd9f..c90418a 100644 --- a/benchmark/bench_gfp_mul.cu +++ b/benchmark/bench_gfp_mul.cu @@ -1,6 +1,8 @@ #include <benchmark/benchmark.h> #include "test_header.cuh" +using namespace gfp; + static void bench_gfp(benchmark::State &state) { uint_fast32_t seed = 41921095;
diff --git a/include/gf256/gf256_elim.cuh b/include/gf256/gf256_elim.cuh index 99ae863..2e352c9 100644 --- a/include/gf256/gf256_elim.cuh +++ b/include/gf256/gf256_elim.cuh @@ -3,190 +3,193 @@ #include "gf256_mat.cuh" -void MatGF256::cpu_swap_row(size_t r1, size_t r2) +namespace gf256 { - if (r1 == r2) + void MatGF256::cpu_swap_row(size_t r1, size_t r2) { - return; - } - base_t *p1 = at_base(r1, 0); - base_t *p2 = at_base(r2, 0); - for (size_t i = 0; i < width; i++) - { - base_t temp = p1[i]; - p1[i] = p2[i]; - p2[i] = temp; - } -} - -size_t gf256_cpu_elim_base(base_t *base_col, base_t base_col_len, size_t st_r, size_t w, vector<size_t> &p_col, vector<size_t> &p_row, const GF256 &gf) -{ - size_t rank = 0; - size_t pivot[gf256_num]; - size_t next[gf256_num]; - for (size_t pivot_col = 0; pivot_col < gf256_num; pivot_col++) - { - for (size_t r = rank; r < base_col_len; r++) + if (r1 == r2) { - for (size_t i = 0; i < rank; i++) + return; + } + base_t *p1 = at_base(r1, 0); + base_t *p2 = at_base(r2, 0); + for (size_t i = 0; i < width; i++) + { + base_t temp = p1[i]; + p1[i] = p2[i]; + p2[i] = temp; + } + } + + size_t cpu_elim_base(base_t *base_col, base_t base_col_len, size_t st_r, size_t w, vector<size_t> &p_col, vector<size_t> &p_row, const GF256 &gf) + { + size_t rank = 0; + size_t pivot[gf256_num]; + size_t next[gf256_num]; + for (size_t pivot_col = 0; pivot_col < gf256_num; pivot_col++) + { + for (size_t r = rank; r < base_col_len; r++) { - if (next[i] == r) + for (size_t i = 0; i < rank; i++) { - base_col[r] ^= gf.mul_base(get8(base_col[r], pivot[i]), base_col[i], pivot[i] + 1); - next[i]++; + if (next[i] == r) + { + base_col[r] ^= gf.mul_base(get8(base_col[r], pivot[i]), base_col[i], pivot[i] + 1); + next[i]++; + } + } + + if (get8(base_col[r], pivot_col) != 0) + { + p_col.push_back(w * gf256_num + pivot_col); + p_row.push_back(st_r + r); + if (r != rank) + { + base_t temp = base_col[rank]; + base_col[rank] = base_col[r]; + base_col[r] = temp; + } + base_col[rank] = concat8(base_col[rank], pivot_col + 1, gf.mul_base(gf.inv(get8(base_col[rank], pivot_col)), base_col[rank], pivot_col + 1)); + pivot[rank] = pivot_col; + next[rank] = rank + 1; + rank++; + break; } } + } + return rank; + } - if (get8(base_col[r], pivot_col) != 0) + __global__ void gpu_mksrc_kernel(base_t *src, size_t s_rowstride, base_t *spL, size_t src_rank, size_t width) + { + size_t w = blockIdx.x * blockDim.x + threadIdx.x; + if (w >= width) + { + return; + } + base_t temp[gf256_num]; + for (size_t r = 0; r < src_rank; r++) + { + temp[r] = *at_base(src, s_rowstride, r, w); + } + for (size_t r = 0; r < src_rank; r++) + { + for (size_t i = 0; i < r; i++) { - p_col.push_back(w * gf256_num + pivot_col); - p_row.push_back(st_r + r); - if (r != rank) + temp[r] ^= mul_base(get8(spL[r], i), temp[i]); + } + temp[r] = mul_base(get8(spL[r], r), temp[r]); + } + for (size_t rr = 1; rr < src_rank; rr++) + { + size_t r = src_rank - 1 - rr; + for (size_t i = r + 1; i < src_rank; i++) + { + temp[r] ^= mul_base(get8(spL[r], i), temp[i]); + } + } + for (size_t r = 0; r < src_rank; r++) + { + *at_base(src, s_rowstride, r, w) = temp[r]; + } + } + + __global__ void gpu_elim_kernel(base_t *idx, base_t *tb, size_t tb_rowstride, base_t *data, size_t rowstride, size_t rank, base_t pivot_base, size_t st_skip, size_t width, size_t nrows) + { + size_t w = blockIdx.x * blockDim.x + threadIdx.x; + size_t r = blockIdx.y * blockDim.y + threadIdx.y; + + if (w >= width || r >= nrows || (r >= st_skip && r < st_skip + rank)) + { + return; + } + + base_t val = idx[r]; + base_t temp = base_zero; + for (size_t i = 0; i < rank; i++) + { + temp ^= *at_base(tb, tb_rowstride, i * (1 << gf256_len) + get8(val, get8(pivot_base, i)), w); + } + *at_base(data, rowstride, r, w) ^= temp; + } + + __managed__ base_t spL[gf256_num]; + + __host__ ElimResult MatGF256::gpu_elim(const GF256 &gf) + { + gf.cpy_to_constant(); + MatGF256 tb(gf256_num * (1 << gf256_len), ncols); + + base_t *base_col; + cudaMallocManaged(&base_col, nrows * sizeof(base_t)); + base_t *idx; + cudaMallocManaged(&idx, nrows * sizeof(base_t)); + + size_t rank = 0; + vector<size_t> p_col, p_row; + + progress::ProgressBar pb("GPU ELIMINATE", width); + for (size_t w = 0; w < width; w++, pb.tick_display()) + { + CUDA_CHECK(cudaMemcpy2D(base_col + rank, sizeof(base_t), at_base(rank, w), rowstride * sizeof(base_t), sizeof(base_t), nrows - rank, cudaMemcpyDefault)); + + size_t src_rank = cpu_elim_base(base_col + rank, nrows - rank, rank, w, p_col, p_row, gf); + + if (src_rank == 0) + { + continue; + } + + for (size_t i = 0; i < src_rank; i++) + { + cpu_swap_row(rank + i, p_row[rank + i]); + spL[i] = base_zero; + } + + base_t pivot_base = base_zero; + for (size_t r = 0; r < src_rank; r++) + { + size_t loc = (p_col[rank + r] - w * gf256_num); + set8(spL[r], gf.inv(get8(base_col[rank + r], loc)), r); + for (size_t i = 0; i < r; i++) { - base_t temp = base_col[rank]; - base_col[rank] = base_col[r]; - base_col[r] = temp; + set8(spL[i], get8(base_col[rank + i], loc), r); } - base_col[rank] = concat8(base_col[rank], pivot_col + 1, gf.mul_base(gf.inv(get8(base_col[rank], pivot_col)), base_col[rank], pivot_col + 1)); - pivot[rank] = pivot_col; - next[rank] = rank + 1; - rank++; + for (size_t i = r + 1; i < src_rank; i++) + { + set8(spL[i], get8(base_col[rank + i], loc), r); + } + set8(pivot_base, loc, r); + } + + dim3 block_src(THREAD_X); + dim3 grid_src((width - w - 1) / block_src.x + 1); + gpu_mksrc_kernel<<<grid_src, block_src>>>(at_base(rank, w), rowstride, spL, src_rank, width); + cudaDeviceSynchronize(); + + dim3 block_tb(THREAD_X, THREAD_Y); + dim3 grid_tb((width - w - 1) / block_tb.x + 1, (src_rank * (1 << gf256_len) - 1) / block_tb.y + 1); + gpu_mktb_kernel<<<grid_tb, block_tb>>>(tb.data, tb.rowstride, at_base(rank, w), rowstride, tb.width); + cudaDeviceSynchronize(); + + CUDA_CHECK(cudaMemcpy2D(idx, sizeof(base_t), at_base(0, w), rowstride * sizeof(base_t), sizeof(base_t), nrows, cudaMemcpyDefault)); + + dim3 block(THREAD_X, THREAD_Y); + dim3 grid((width - w - 1) / block.x + 1, (nrows - 1) / block.y + 1); + gpu_elim_kernel<<<grid, block>>>(idx, tb.data, tb.rowstride, at_base(0, w), rowstride, src_rank, pivot_base, rank, width - w, nrows); + cudaDeviceSynchronize(); + + rank += src_rank; + + if (rank == nrows) + { break; } } + cudaFree(base_col); + cudaFree(idx); + return {rank, p_col, p_row}; } - return rank; -} - -__global__ void gf256_gpu_mksrc_kernel(base_t *src, size_t s_rowstride, base_t *spL, size_t src_rank, size_t width) -{ - size_t w = blockIdx.x * blockDim.x + threadIdx.x; - if (w >= width) - { - return; - } - base_t temp[gf256_num]; - for (size_t r = 0; r < src_rank; r++) - { - temp[r] = *at_base(src, s_rowstride, r, w); - } - for (size_t r = 0; r < src_rank; r++) - { - for (size_t i = 0; i < r; i++) - { - temp[r] ^= mul_base(get8(spL[r], i), temp[i]); - } - temp[r] = mul_base(get8(spL[r], r), temp[r]); - } - for (size_t rr = 1; rr < src_rank; rr++) - { - size_t r = src_rank - 1 - rr; - for (size_t i = r + 1; i < src_rank; i++) - { - temp[r] ^= mul_base(get8(spL[r], i), temp[i]); - } - } - for (size_t r = 0; r < src_rank; r++) - { - *at_base(src, s_rowstride, r, w) = temp[r]; - } -} - -__global__ void gf256_gpu_elim_kernel(base_t *idx, base_t *tb, size_t tb_rowstride, base_t *data, size_t rowstride, size_t rank, base_t pivot_base, size_t st_skip, size_t width, size_t nrows) -{ - size_t w = blockIdx.x * blockDim.x + threadIdx.x; - size_t r = blockIdx.y * blockDim.y + threadIdx.y; - - if (w >= width || r >= nrows || (r >= st_skip && r < st_skip + rank)) - { - return; - } - - base_t val = idx[r]; - base_t temp = base_zero; - for (size_t i = 0; i < rank; i++) - { - temp ^= *at_base(tb, tb_rowstride, i * (1 << gf256_len) + get8(val, get8(pivot_base, i)), w); - } - *at_base(data, rowstride, r, w) ^= temp; -} - -__managed__ base_t spL[gf256_num]; - -__host__ ElimResult MatGF256::gpu_elim(const GF256 &gf) -{ - gf.cpy_to_constant(); - MatGF256 tb(gf256_num * (1 << gf256_len), ncols); - - base_t *base_col; - cudaMallocManaged(&base_col, nrows * sizeof(base_t)); - base_t *idx; - cudaMallocManaged(&idx, nrows * sizeof(base_t)); - - size_t rank = 0; - vector<size_t> p_col, p_row; - - progress::ProgressBar pb("GPU ELIMINATE", width); - for (size_t w = 0; w < width; w++, pb.tick_display()) - { - CUDA_CHECK(cudaMemcpy2D(base_col + rank, sizeof(base_t), at_base(rank, w), rowstride * sizeof(base_t), sizeof(base_t), nrows - rank, cudaMemcpyDefault)); - - size_t src_rank = gf256_cpu_elim_base(base_col + rank, nrows - rank, rank, w, p_col, p_row, gf); - - if (src_rank == 0) - { - continue; - } - - for (size_t i = 0; i < src_rank; i++) - { - cpu_swap_row(rank + i, p_row[rank + i]); - spL[i] = base_zero; - } - - base_t pivot_base = base_zero; - for (size_t r = 0; r < src_rank; r++) - { - size_t loc = (p_col[rank + r] - w * gf256_num); - set8(spL[r], gf.inv(get8(base_col[rank + r], loc)), r); - for (size_t i = 0; i < r; i++) - { - set8(spL[i], get8(base_col[rank + i], loc), r); - } - for (size_t i = r + 1; i < src_rank; i++) - { - set8(spL[i], get8(base_col[rank + i], loc), r); - } - set8(pivot_base, loc, r); - } - - dim3 block_src(THREAD_X); - dim3 grid_src((width - w - 1) / block_src.x + 1); - gf256_gpu_mksrc_kernel<<<grid_src, block_src>>>(at_base(rank, w), rowstride, spL, src_rank, width); - cudaDeviceSynchronize(); - - dim3 block_tb(THREAD_X, THREAD_Y); - dim3 grid_tb((width - w - 1) / block_tb.x + 1, (src_rank * (1 << gf256_len) - 1) / block_tb.y + 1); - gpu_mktb_kernel<<<grid_tb, block_tb>>>(tb.data, tb.rowstride, at_base(rank, w), rowstride, tb.width); - cudaDeviceSynchronize(); - - CUDA_CHECK(cudaMemcpy2D(idx, sizeof(base_t), at_base(0, w), rowstride * sizeof(base_t), sizeof(base_t), nrows, cudaMemcpyDefault)); - - dim3 block(THREAD_X, THREAD_Y); - dim3 grid((width - w - 1) / block.x + 1, (nrows - 1) / block.y + 1); - gf256_gpu_elim_kernel<<<grid, block>>>(idx, tb.data, tb.rowstride, at_base(0, w), rowstride, src_rank, pivot_base, rank, width - w, nrows); - cudaDeviceSynchronize(); - - rank += src_rank; - - if (rank == nrows) - { - break; - } - } - cudaFree(base_col); - cudaFree(idx); - return {rank, p_col, p_row}; } #endif \ No newline at end of file
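The hunk above is a pure namespace wrap: gf256_cpu_elim_base becomes gf256::cpu_elim_base and the gf256_ prefixes drop from the kernel names; the algorithm is untouched. For intuition, what gpu_elim performs word-column by word-column is ordinary Gauss-Jordan elimination over GF(2^8). A minimal host-only reference on unpacked bytes, assuming the field polynomial 0x11B (one entry of irreducible_polynomials_degree_08); the names gfmul, gfinv and elim_ref are hypothetical helpers for this sketch, not part of the repo:

    #include <cstdint>
    #include <cstddef>

    static uint8_t gfmul(uint8_t x, uint8_t y) // shift-and-reduce product mod 0x11B
    {
        uint16_t acc = 0;
        for (int i = 0; i < 8; i++)
            if (y & (1 << i))
                acc ^= (uint16_t)x << i;
        for (int i = 14; i >= 8; i--)
            if (acc & (1 << i))
                acc ^= 0x11B << (i - 8);
        return (uint8_t)acc;
    }

    static uint8_t gfinv(uint8_t x) // brute force is fine: only 255 units
    {
        for (int y = 1; y < 256; y++)
            if (gfmul(x, (uint8_t)y) == 1)
                return (uint8_t)y;
        return 0;
    }

    // Gauss-Jordan on an r x c byte matrix; returns the rank.
    size_t elim_ref(uint8_t *m, size_t r, size_t c)
    {
        size_t rank = 0;
        for (size_t col = 0; col < c && rank < r; col++)
        {
            size_t p = rank;
            while (p < r && m[p * c + col] == 0) // find a pivot in this column
                p++;
            if (p == r)
                continue;
            for (size_t j = 0; j < c; j++) // swap the pivot row up
            {
                uint8_t t = m[rank * c + j];
                m[rank * c + j] = m[p * c + j];
                m[p * c + j] = t;
            }
            uint8_t inv = gfinv(m[rank * c + col]); // normalize the pivot to 1
            for (size_t j = 0; j < c; j++)
                m[rank * c + j] = gfmul(inv, m[rank * c + j]);
            for (size_t i = 0; i < r; i++) // clear the column; subtraction is XOR
                if (i != rank && m[i * c + col] != 0)
                {
                    uint8_t f = m[i * c + col];
                    for (size_t j = 0; j < c; j++)
                        m[i * c + j] ^= gfmul(f, m[rank * c + j]);
                }
            rank++;
        }
        return rank;
    }

The GPU version reaches the same reduced form but batches the work: cpu_elim_base finds up to eight pivots inside one 64-bit word column, gpu_mksrc_kernel normalizes those pivot rows, gpu_mktb_kernel tabulates all 256 multiples of each, and gpu_elim_kernel clears every other row with table lookups and XORs.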
diff --git a/include/gf256/gf256_header.cuh b/include/gf256/gf256_header.cuh index d94b81f..cb2b0a3 100755 --- a/include/gf256/gf256_header.cuh +++ b/include/gf256/gf256_header.cuh @@ -4,189 +4,191 @@ #include "../header.cuh" #include <set> -using gf256_t = uint8_t; - -static const size_t gf256_len = sizeof(gf256_t) * 8; -static const size_t gf256_num = base_len / gf256_len; - -static const gf256_t gf256_zero = (gf256_t)0x00; -static const gf256_t gf256_one = (gf256_t)0x01; -static const gf256_t gf256_fullmask = (gf256_t)0xFF; - -static const base_t gf256_mask[8] = { - (base_t)0x00'00'00'00'00'00'00'FF, - (base_t)0x00'00'00'00'00'00'FF'00, - (base_t)0x00'00'00'00'00'FF'00'00, - (base_t)0x00'00'00'00'FF'00'00'00, - (base_t)0x00'00'00'FF'00'00'00'00, - (base_t)0x00'00'FF'00'00'00'00'00, - (base_t)0x00'FF'00'00'00'00'00'00, - (base_t)0xFF'00'00'00'00'00'00'00}; - -__host__ __device__ inline size_t offset8(size_t idx) +namespace gf256 { - return idx << 3; -} + using gf256_t = uint8_t; -__host__ __device__ inline gf256_t get8(base_t src, size_t idx) -{ - return (gf256_t)(src >> offset8(idx)); -} + static const size_t gf256_len = sizeof(gf256_t) * 8; + static const size_t gf256_num = base_len / gf256_len; -// set8 assumes the target byte is currently zero -__host__ __device__ inline void set8(base_t &dst, gf256_t src, size_t idx) -{ - dst |= (base_t)src << offset8(idx); -} + static const gf256_t gf256_zero = (gf256_t)0x00; + static const gf256_t gf256_one = (gf256_t)0x01; + static const gf256_t gf256_fullmask = (gf256_t)0xFF; -__host__ inline void del8(base_t &dst, size_t idx) -{ - dst &= ~gf256_mask[idx]; -} + static const base_t gf256_mask[8] = { + (base_t)0x00'00'00'00'00'00'00'FF, + (base_t)0x00'00'00'00'00'00'FF'00, + (base_t)0x00'00'00'00'00'FF'00'00, + (base_t)0x00'00'00'00'FF'00'00'00, + (base_t)0x00'00'00'FF'00'00'00'00, + (base_t)0x00'00'FF'00'00'00'00'00, + (base_t)0x00'FF'00'00'00'00'00'00, + (base_t)0xFF'00'00'00'00'00'00'00}; -__host__ inline base_t concat8(base_t dst_l, size_t idx_l, base_t dst_r) -{ - if (idx_l == 0) + __host__ __device__ inline size_t offset8(size_t idx) { - return dst_r; - } - if (idx_l == gf256_num) - { - return dst_l; - } - return (dst_l & (base_fullmask >> (base_len - offset8(idx_l)))) | (dst_r & (base_fullmask << offset8(idx_l))); -} - -__host__ inline base_t rev8(base_t n) -{ - n = (n & (base_t)0xFF'00'FF'00'FF'00'FF'00) >> 8 | (n & (base_t)0x00'FF'00'FF'00'FF'00'FF) << 8; - n = (n & (base_t)0xFF'FF'00'00'FF'FF'00'00) >> 16 | (n & (base_t)0x00'00'FF'FF'00'00'FF'FF) << 16; - return n >> 32 | n << 32; -} - -__constant__ gf256_t d_mul_table[1 << gf256_len][1 << gf256_len]; - -__device__ inline base_t mul_base(const gf256_t val, const base_t base) -{ - if (val == 0) - { - return base_zero; - } - base_t temp = base_zero; - for (size_t i = 0; i < gf256_len; i++) - { - set8(temp, d_mul_table[val][get8(base, i)], i); - } - return temp; -} - -__global__ void gpu_mktb_kernel(base_t *tb, size_t tb_rowstride, base_t *src, size_t s_rowstride, size_t width) -{ - size_t w = blockIdx.x * blockDim.x + threadIdx.x; - size_t r = blockIdx.y * blockDim.y + threadIdx.y; - - if (w >= width) - { - return; + return idx << 3; } - gf256_t val = get8(r, 0); - base_t s = *at_base(src, s_rowstride, get8(r, 1), w); - base_t d = mul_base(val, s); - *at_base(tb, tb_rowstride, r, w) = d; -} - -static const set<base_t> irreducible_polynomials_degree_08{0x11b, 0x11d, 0x12b, 0x12d, 0x139, 0x13f, 0x14d, 0x15f, 0x163, 0x165, 0x169, 0x171, 0x177, 0x17b, 0x187, 0x18b, 0x18d, 0x19f, 0x1a3, 0x1a9, 0x1b1, 0x1bd, 0x1c3, 0x1cf, 0x1d7, 0x1dd, 0x1e7, 0x1f3, 0x1f5, 0x1f9}; - -class GF256 -{ -public: - GF256(base_t poly) + __host__ __device__ inline gf256_t get8(base_t src, size_t idx) { - assert(irreducible_polynomials_degree_08.count(poly) == 1); - this->poly = poly; - for (size_t x = 0; x < (1 << gf256_len); x++) + return (gf256_t)(src >> offset8(idx)); + } + + // set8 assumes the target byte is currently zero + __host__ __device__ inline void set8(base_t &dst, gf256_t src, size_t idx) + { + dst |= (base_t)src << offset8(idx); + } + + __host__ inline void del8(base_t &dst, size_t idx) + { + dst &= ~gf256_mask[idx]; + } + + __host__ inline base_t concat8(base_t dst_l, size_t idx_l, base_t dst_r) + { + if (idx_l == 0) { - mul_table[x][gf256_zero] = gf256_zero; - for (size_t d = 0; d < gf256_len; d++) + return dst_r; + } + if (idx_l == gf256_num) + { + return dst_l; + } + return (dst_l & (base_fullmask >> (base_len - offset8(idx_l)))) | (dst_r & (base_fullmask << offset8(idx_l))); + } + + __host__ inline base_t rev8(base_t n) + { + n = (n & (base_t)0xFF'00'FF'00'FF'00'FF'00) >> 8 | (n & (base_t)0x00'FF'00'FF'00'FF'00'FF) << 8; + n = (n & (base_t)0xFF'FF'00'00'FF'FF'00'00) >> 16 | (n & (base_t)0x00'00'FF'FF'00'00'FF'FF) << 16; + return n >> 32 | n << 32; + } + + __constant__ gf256_t d_mul_table[1 << gf256_len][1 << gf256_len]; + + __device__ inline base_t mul_base(const gf256_t val, const base_t base) + { + if (val == 0) + { + return base_zero; + } + base_t temp = base_zero; + for (size_t i = 0; i < gf256_len; i++) + { + set8(temp, d_mul_table[val][get8(base, i)], i); + } + return temp; + } + + __global__ void gpu_mktb_kernel(base_t *tb, size_t tb_rowstride, base_t *src, size_t s_rowstride, size_t width) + { + size_t w = blockIdx.x * blockDim.x + threadIdx.x; + size_t r = blockIdx.y * blockDim.y + threadIdx.y; + + if (w >= width) + { + return; + } + + gf256_t val = get8(r, 0); + base_t s = *at_base(src, s_rowstride, get8(r, 1), w); + base_t d = mul_base(val, s); + *at_base(tb, tb_rowstride, r, w) = d; + } + + static const set<base_t> irreducible_polynomials_degree_08{0x11b, 0x11d, 0x12b, 0x12d, 0x139, 0x13f, 0x14d, 0x15f, 0x163, 0x165, 0x169, 0x171, 0x177, 0x17b, 0x187, 0x18b, 0x18d, 0x19f, 0x1a3, 0x1a9, 0x1b1, 0x1bd, 0x1c3, 0x1cf, 0x1d7, 0x1dd, 0x1e7, 0x1f3, 0x1f5, 0x1f9}; + + class GF256 + { + public: + GF256(base_t poly) + { + assert(irreducible_polynomials_degree_08.count(poly) == 1); + this->poly = poly; + for (size_t x = 0; x < (1 << gf256_len); x++) { - gf256_t val = shift_left(x, d); - for (size_t y = (1 << d); y < (1 << (d + 1)); y++) + mul_table[x][gf256_zero] = gf256_zero; + for (size_t d = 0; d < gf256_len; d++) { - mul_table[x][y] = val ^ mul_table[x][y ^ (1 << d)]; - if (mul_table[x][y] == gf256_one) + gf256_t val = shift_left(x, d); + for (size_t y = (1 << d); y < (1 << (d + 1)); y++) { - inv_table[x] = y; + mul_table[x][y] = val ^ mul_table[x][y ^ (1 << d)]; + if (mul_table[x][y] == gf256_one) + { + inv_table[x] = y; + } } } } + inv_table[gf256_zero] = gf256_zero; } - inv_table[gf256_zero] = gf256_zero; - } - gf256_t mul(const gf256_t x, const gf256_t y) const - { - return mul_table[x][y]; - } - - base_t mul_base(const gf256_t val, const base_t base, const size_t offset = 0) const - { - base_t temp = base_zero; - for (size_t i = offset; i < gf256_num; i++) + gf256_t mul(const gf256_t x, const gf256_t y) const { - set8(temp, mul(val, get8(base, i)), i); + return mul_table[x][y]; } - return temp; - } - gf256_t inv(gf256_t x) const - { - return inv_table[x]; - } - - inline cudaError_t cpy_to_constant() const - { - return cudaMemcpyToSymbol(d_mul_table, mul_table, (1 << gf256_len) * (1 << gf256_len) * sizeof(gf256_t)); - } - - friend ostream &operator<<(ostream &out, const GF256 &gf); - - GF256() = delete; - GF256(const GF256 &) = delete; - GF256(GF256 &&) = delete; - GF256 &operator=(const GF256 &) = delete; - GF256 &operator=(GF256 &&) = delete; - -private: - gf256_t shift_left(gf256_t x, size_t d) - {
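The constructor fills mul_table without ever doing a full polynomial multiply per entry: shift_left(x, d) yields x * 2^d reduced by poly, and every remaining product follows from GF(2)-linearity, mul(x, y) = mul(x, y ^ 2^d) ^ mul(x, 2^d) where 2^d is y's highest set bit. A standalone sketch of the same construction, cross-checked against a direct shift-and-reduce multiply; mul_ref is a hypothetical name and 0x11B is just one member of irreducible_polynomials_degree_08:

    #include <cstdint>
    #include <cstdio>

    static uint8_t mul_ref(uint8_t x, uint8_t y, uint16_t poly) // ground truth
    {
        uint16_t acc = 0;
        for (int i = 0; i < 8; i++)
            if (y & (1 << i))
                acc ^= (uint16_t)x << i;
        for (int i = 14; i >= 8; i--)
            if (acc & (1 << i))
                acc ^= poly << (i - 8);
        return (uint8_t)acc;
    }

    int main()
    {
        const uint16_t poly = 0x11B;
        static uint8_t table[256][256];
        for (int x = 0; x < 256; x++)
        {
            table[x][0] = 0;
            for (int d = 0; d < 8; d++)
            {
                uint8_t val = mul_ref((uint8_t)x, (uint8_t)(1 << d), poly); // like shift_left(x, d)
                for (int y = (1 << d); y < (1 << (d + 1)); y++)
                    table[x][y] = val ^ table[x][y ^ (1 << d)]; // linearity over GF(2)
            }
        }
        for (int x = 0; x < 256; x++)
            for (int y = 0; y < 256; y++)
                if (table[x][y] != mul_ref((uint8_t)x, (uint8_t)y, poly))
                    return 1;
        printf("57 * 83 = %02X\n", table[0x57][0x83]); // prints C1 for poly 0x11B
        return 0;
    }

The same pass records inv_table for free: whenever a product hits gf256_one, y is x's inverse.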
cudaMemcpyToSymbol(d_mul_table, mul_table, (1 << gf256_len) * (1 << gf256_len) * sizeof(gf256_t)); - } - - friend ostream &operator<<(ostream &out, const GF256 &gf); - - GF256() = delete; - GF256(const GF256 &) = delete; - GF256(GF256 &&) = delete; - GF256 &operator=(const GF256 &) = delete; - GF256 &operator=(GF256 &&) = delete; - -private: - gf256_t shift_left(gf256_t x, size_t d) - { - base_t temp = (base_t)x << d; - for (size_t i = gf256_len - 1 + d; i > gf256_len - 1; i--) + base_t mul_base(const gf256_t val, const base_t base, const size_t offset = 0) const { - if (temp & (1 << i)) + base_t temp = base_zero; + for (size_t i = offset; i < gf256_num; i++) { - temp ^= poly << (i - gf256_len); + set8(temp, mul(val, get8(base, i)), i); } + return temp; } - return temp; - } - base_t poly; - gf256_t inv_table[1 << gf256_num]; - gf256_t mul_table[1 << gf256_num][1 << gf256_num]; -}; - -ostream &operator<<(ostream &out, const GF256 &gf) -{ - for (size_t x = 0; x < 1 << gf256_len; x++) - { - for (size_t y = 0; y < 1 << gf256_len; y++) + gf256_t inv(gf256_t x) const { - printf("%02X ", gf.mul_table[x][y]); + return inv_table[x]; } - printf("\n"); - } - return out; -} + inline cudaError_t cpy_to_constant() const + { + return cudaMemcpyToSymbol(d_mul_table, mul_table, (1 << gf256_len) * (1 << gf256_len) * sizeof(gf256_t)); + } + + friend ostream &operator<<(ostream &out, const GF256 &gf); + + GF256() = delete; + GF256(const GF256 &) = delete; + GF256(GF256 &&) = delete; + GF256 &operator=(const GF256 &) = delete; + GF256 &operator=(GF256 &&) = delete; + + private: + gf256_t shift_left(gf256_t x, size_t d) + { + base_t temp = (base_t)x << d; + for (size_t i = gf256_len - 1 + d; i > gf256_len - 1; i--) + { + if (temp & (1 << i)) + { + temp ^= poly << (i - gf256_len); + } + } + return temp; + } + + base_t poly; + gf256_t inv_table[1 << gf256_num]; + gf256_t mul_table[1 << gf256_num][1 << gf256_num]; + }; + + ostream &operator<<(ostream &out, const GF256 &gf) + { + for (size_t x = 0; x < 1 << gf256_len; x++) + { + for (size_t y = 0; y < 1 << gf256_len; y++) + { + printf("%02X ", gf.mul_table[x][y]); + } + printf("\n"); + } + return out; + } } #endif \ No newline at end of file
diff --git a/include/gf256/gf256_mat.cuh b/include/gf256/gf256_mat.cuh index 0689635..4d14593 100755 --- a/include/gf256/gf256_mat.cuh +++ b/include/gf256/gf256_mat.cuh @@ -7,220 +7,223 @@ #include #include -struct ElimResult +namespace gf256 { - size_t rank; - vector<size_t> pivot; - vector<size_t> swap_row; -}; - -class MatGF256 -{ -public: - enum MatType + struct ElimResult { - root, - window, - moved, + size_t rank; + vector<size_t> pivot; + vector<size_t> swap_row; }; - // only used to construct root matrices - MatGF256(size_t nrows, size_t ncols) : nrows(nrows), ncols(ncols), type(root) + + class MatGF256 { - width = (ncols - 1) / gf256_num + 1; - rowstride = ((width - 1) / 4 + 1) * 4; // align to 32 bytes (4 x 64-bit words) - CUDA_CHECK(cudaMallocManaged((void **)&data, nrows * rowstride * sizeof(base_t))); - CUDA_CHECK(cudaMemset(data, 0, nrows * rowstride * sizeof(base_t))); - } - // windows can only be created at base_t granularity - MatGF256(const MatGF256 &src, size_t begin_ri, size_t begin_wi, size_t end_rj, size_t end_wj) : nrows(end_rj - begin_ri), ncols((end_wj == src.width ? src.ncols : end_wj * gf256_num) - begin_wi * gf256_num), width(end_wj - begin_wi), rowstride(src.rowstride), type(window), data(src.at_base(begin_ri, begin_wi)) - { - assert(begin_ri < end_rj && end_rj <= src.nrows && begin_wi < end_wj && end_wj <= src.width); - } - // copy construction always yields a root matrix - MatGF256(const MatGF256 &m) : MatGF256(m.nrows, m.ncols) - { - CUDA_CHECK(cudaMemcpy2D(data, rowstride * sizeof(base_t), m.data, m.rowstride * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault)); - } - MatGF256(MatGF256 &&m) noexcept : nrows(m.nrows), ncols(m.ncols), width(m.width), rowstride(m.rowstride), type(m.type), data(m.data) - { - m.type = moved; - m.data = nullptr; - } - MatGF256 &operator=(const MatGF256 &m) - { - if (this == &m) + public: + enum MatType { + root, + window, + moved, + }; + // only used to construct root matrices + MatGF256(size_t nrows, size_t ncols) : nrows(nrows), ncols(ncols), type(root) + { + width = (ncols - 1) / gf256_num + 1; + rowstride = ((width - 1) / 4 + 1) * 4; // align to 32 bytes (4 x 64-bit words) + CUDA_CHECK(cudaMallocManaged((void **)&data, nrows * rowstride * sizeof(base_t))); + CUDA_CHECK(cudaMemset(data, 0, nrows * rowstride * sizeof(base_t))); + } + // windows can only be created at base_t granularity + MatGF256(const MatGF256 &src, size_t begin_ri, size_t begin_wi, size_t end_rj, size_t end_wj) : nrows(end_rj - begin_ri), ncols((end_wj == src.width ? src.ncols : end_wj * gf256_num) - begin_wi * gf256_num), width(end_wj - begin_wi), rowstride(src.rowstride), type(window), data(src.at_base(begin_ri, begin_wi)) + { + assert(begin_ri < end_rj && end_rj <= src.nrows && begin_wi < end_wj && end_wj <= src.width); + } + // copy construction always yields a root matrix + MatGF256(const MatGF256 &m) : MatGF256(m.nrows, m.ncols) + { + CUDA_CHECK(cudaMemcpy2D(data, rowstride * sizeof(base_t), m.data, m.rowstride * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault)); + } + MatGF256(MatGF256 &&m) noexcept : nrows(m.nrows), ncols(m.ncols), width(m.width), rowstride(m.rowstride), type(m.type), data(m.data) + { + m.type = moved; + m.data = nullptr; + } + MatGF256 &operator=(const MatGF256 &m) + { + if (this == &m) + { + return *this; + } + assert(nrows == m.nrows && ncols == m.ncols); + CUDA_CHECK(cudaMemcpy2D(data, rowstride * sizeof(base_t), m.data, m.rowstride * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault)); return *this; } - assert(nrows == m.nrows && ncols == m.ncols); - CUDA_CHECK(cudaMemcpy2D(data, rowstride * sizeof(base_t), m.data, m.rowstride * sizeof(base_t), m.width * sizeof(base_t), nrows, cudaMemcpyDefault)); - return *this; - } - MatGF256 &operator=(MatGF256 &&m) noexcept - { - if (this == &m) + MatGF256 &operator=(MatGF256 &&m) noexcept { + if (this == &m) + { + return *this; + } + if (type == root) + { + CUDA_CHECK(cudaFree(data)); + } + nrows = m.nrows; + ncols = m.ncols; + width = m.width; + rowstride = m.rowstride; + type = m.type; + data = m.data; + m.type = moved; + m.data = nullptr; return *this; } - if (type == root) - { - CUDA_CHECK(cudaFree(data)); - } - nrows = m.nrows; - ncols = m.ncols; - width = m.width; - rowstride = m.rowstride; - type = m.type; - data = m.data; - m.type = moved; - m.data = nullptr; - return *this; - } - ~MatGF256() - { - if (type == root) + ~MatGF256() { - CUDA_CHECK(cudaFree(data)); - } - } - - inline base_t *at_base(size_t r, size_t w) const - { - return data + r * rowstride + w; - } - - void randomize(uint_fast32_t seed) - { - assert(type == root); - static default_random_engine e(seed); - static uniform_int_distribution<base_t> d; - base_t lastmask = base_fullmask >> (width * base_len - ncols * gf256_len); - for (size_t r = 0; r < nrows; r++) - { - for (size_t w = 0; w < width; w++) + if (type == root) { - *at_base(r, w) = d(e); - } - *at_base(r, width - 1) &= lastmask; - } - } - - // generate a random reduced row echelon matrix; pick nrows pivot columns among the first rank_col columns - void randomize(size_t rank_col, uint_fast32_t seed) - { - assert(nrows <= rank_col && rank_col <= ncols); - randomize(seed); - vector<size_t> pivot(rank_col); - iota(pivot.begin(), pivot.end(), 0); - random_shuffle(pivot.begin(), pivot.end()); - pivot.resize(nrows); - sort(pivot.begin(), pivot.end()); - - vector<base_t> pivotmask(width, base_fullmask); - for (size_t r = 0; r < nrows; r++) - { - del8(pivotmask[pivot[r] / gf256_num], pivot[r] % gf256_num); - } - - for (size_t r = 0; r < nrows; r++) - { - for (size_t w = 0; w < pivot[r] / gf256_num; w++) - { - *at_base(r, w) = base_zero; - } - base_t *now = at_base(r, pivot[r] / gf256_num); - *now = concat8(base_zero, pivot[r] % gf256_num + 1, *now & pivotmask[pivot[r] / gf256_num]); - set8(*now, gf256_one, pivot[r] % gf256_num); - for (size_t w = pivot[r] / gf256_num + 1; w < rank_col / gf256_num + 1; w++) - { - *at_base(r, w) &= pivotmask[w]; + CUDA_CHECK(cudaFree(data)); } } - } - bool operator==(const MatGF256 &m) const - { - if (nrows != m.nrows || ncols != m.ncols) + inline base_t *at_base(size_t r, size_t w) const { - return false; + return data + r * rowstride + w; } - for (size_t r = 0; r < nrows; r++) + + void randomize(uint_fast32_t seed) { - for (size_t w = 0; w < width; w++) + assert(type == root); + static default_random_engine e(seed); + static uniform_int_distribution<base_t> d; + base_t lastmask = base_fullmask >> (width * base_len - ncols * gf256_len); + for (size_t r = 0; r < nrows; r++) { - if (*at_base(r, w) != *m.at_base(r, w)) + for (size_t w = 0; w < width; w++) { - return false; + *at_base(r, w) = d(e); + } + *at_base(r, width - 1) &= lastmask; + } + } + + // generate a random reduced row echelon matrix; pick nrows pivot columns among the first rank_col columns + void randomize(size_t rank_col, uint_fast32_t seed) + { + assert(nrows <= rank_col && rank_col <= ncols); + randomize(seed); + vector<size_t> pivot(rank_col); + iota(pivot.begin(), pivot.end(), 0); + random_shuffle(pivot.begin(), pivot.end()); + pivot.resize(nrows); + sort(pivot.begin(), pivot.end()); + + vector<base_t> pivotmask(width, base_fullmask); + for (size_t r = 0; r < nrows; r++) + { + del8(pivotmask[pivot[r] / gf256_num], pivot[r] % gf256_num); + } + + for (size_t r = 0; r < nrows; r++) + { + for (size_t w = 0; w < pivot[r] / gf256_num; w++) + { + *at_base(r, w) = base_zero; + } + base_t *now = at_base(r, pivot[r] / gf256_num); + *now = concat8(base_zero, pivot[r] % gf256_num + 1, *now & pivotmask[pivot[r] / gf256_num]); + set8(*now, gf256_one, pivot[r] % gf256_num); + for (size_t w = pivot[r] / gf256_num + 1; w < rank_col / gf256_num + 1; w++) + { + *at_base(r, w) &= pivotmask[w]; } } }
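MatGF256 stores eight GF(256) entries per 64-bit base_t, so width counts words rather than columns. A tiny host-only illustration of the get8/set8 packing contract from gf256_header.cuh (set8 really is an OR, which is why callers must clear the byte first, e.g. with del8):

    #include <cstdint>
    #include <cstddef>
    #include <cassert>

    using base_t = uint64_t;
    using gf256_t = uint8_t;

    inline size_t offset8(size_t idx) { return idx << 3; } // byte idx -> bit offset
    inline gf256_t get8(base_t src, size_t idx) { return (gf256_t)(src >> offset8(idx)); }
    inline void set8(base_t &dst, gf256_t src, size_t idx) { dst |= (base_t)src << offset8(idx); }

    int main()
    {
        base_t w = 0;
        set8(w, 0xAB, 3); // element 3 occupies bits 24..31
        assert(get8(w, 3) == 0xAB);
        assert(w == 0x00000000AB000000ULL);
        return 0;
    }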
== m.nrows && ncols == m.ncols); + for (size_t r = 0; r < nrows; r++) + { + for (size_t w = 0; w < width; w++) + { + *at_base(r, w) ^= *m.at_base(r, w); } } } - return true; - } - void operator^=(const MatGF256 &m) - { - assert(nrows == m.nrows && ncols == m.ncols); - for (size_t r = 0; r < nrows; r++) + MatGF256 operator^(const MatGF256 &m) const { - for (size_t w = 0; w < width; w++) + MatGF256 temp(*this); + temp ^= m; + return temp; + } + + void gpu_addmul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf); + friend MatGF256 gpu_mul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf); + + // size_t cpu_elim_base(base_t *base_col, size_t st_r, size_t w, vector<size_t> &p_col, vector<size_t> &p_row, base_t step[gf256_num], const GF256 &gf); + void cpu_swap_row(size_t r1, size_t r2); + // void cpu_mul_row(size_t r, gf256_t val, const GF256 &gf); + ElimResult gpu_elim(const GF256 &gf); + + friend ostream &operator<<(ostream &out, const MatGF256 &m); + + size_t nrows, ncols, width; + + private: + MatGF256() : nrows(0), ncols(0), width(0), rowstride(0), type(moved), data(nullptr) {} + + size_t rowstride; + MatType type; + base_t *data; + }; + + ostream &operator<<(ostream &out, const MatGF256 &m) + { + for (size_t r = 0; r < m.nrows; r++) + { + for (size_t w = 0; w < m.width; w++) { - *at_base(r, w) ^= *m.at_base(r, w); + printf("%016lX ", rev8(*m.at_base(r, w))); } + printf("\n"); } + return out; } - MatGF256 operator^(const MatGF256 &m) const - { - MatGF256 temp(*this); - temp ^= m; - return temp; - } - - void gpu_addmul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf); - friend MatGF256 gpu_mul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf); - - // size_t cpu_elim_base(base_t *base_col, size_t st_r, size_t w, vector<size_t> &p_col, vector<size_t> &p_row, base_t step[gf256_num], const GF256 &gf); - void cpu_swap_row(size_t r1, size_t r2); - // void cpu_mul_row(size_t r, gf256_t val, const GF256 &gf); - ElimResult gpu_elim(const GF256 &gf); - - friend ostream &operator<<(ostream &out, const MatGF256 &m); - - size_t nrows, ncols, width; - -private: - MatGF256() : nrows(0), ncols(0), width(0), rowstride(0), type(moved), data(nullptr) {} - - size_t rowstride; - MatType type; - base_t *data; -}; - -ostream &operator<<(ostream &out, const MatGF256 &m) -{ - for (size_t r = 0; r < m.nrows; r++) - { - for (size_t w = 0; w < m.width; w++) - { - printf("%016lX ", rev8(*m.at_base(r, w))); - } - printf("\n"); - } - return out; } #endif \ No newline at end of file
diff --git a/include/gf256/gf256_mul.cuh b/include/gf256/gf256_mul.cuh index eb05584..e0a5464 100644 --- a/include/gf256/gf256_mul.cuh +++ b/include/gf256/gf256_mul.cuh @@ -3,54 +3,57 @@ #include "gf256_mat.cuh" -__global__ void gf256_gpu_addmul_kernel(base_t *a, size_t a_rowstride, base_t *tb, size_t tb_rowstride, base_t *c, size_t c_rowstride, size_t tb_num, size_t width, size_t nrows) +namespace gf256 { - size_t w = blockIdx.x * blockDim.x + threadIdx.x; - size_t r = blockIdx.y * blockDim.y + threadIdx.y; - - if (w >= width || r >= nrows) + __global__ void gpu_addmul_kernel(base_t *a, size_t a_rowstride, base_t *tb, size_t tb_rowstride, base_t *c, size_t c_rowstride, size_t tb_num, size_t width, size_t nrows) { - return; + size_t w = blockIdx.x * blockDim.x + threadIdx.x; + size_t r = blockIdx.y * blockDim.y + threadIdx.y; + + if (w >= width || r >= nrows) + { + return; + } + + base_t val = *at_base(a, a_rowstride, r, 0); + base_t temp = base_zero; + for (size_t i = 0; i < tb_num; i++) + { + temp ^= *at_base(tb, tb_rowstride, i * (1 << gf256_len) + get8(val, i), w); + } + *at_base(c, c_rowstride, r, w) ^= temp; } - base_t val = *at_base(a, a_rowstride, r, 0); - base_t temp = base_zero; - for (size_t i = 0; i < tb_num; i++) + __host__ void MatGF256::gpu_addmul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf) { - temp ^= *at_base(tb, tb_rowstride, i * (1 << gf256_len) +
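The multiply is table-driven: gpu_mktb_kernel materializes, for each byte position i and every scalar v in 0..255, the row v * B[w*8+i], after which gpu_addmul_kernel resolves each product with one table lookup per byte of A. The same trick on the CPU for a single byte, with gfmul as a hypothetical stand-in for the repo's table (polynomial 0x11B assumed):

    #include <cstdint>

    static uint8_t gfmul(uint8_t x, uint8_t y) // GF(2^8) product mod 0x11B
    {
        uint16_t acc = 0;
        for (int i = 0; i < 8; i++)
            if (y & (1 << i))
                acc ^= (uint16_t)x << i;
        for (int i = 14; i >= 8; i--)
            if (acc & (1 << i))
                acc ^= 0x11B << (i - 8);
        return (uint8_t)acc;
    }

    int main()
    {
        uint8_t b = 0x53; // one byte of a row of B
        uint8_t tb[256];  // all 256 scalar multiples, built once per row
        for (int v = 0; v < 256; v++)
            tb[v] = gfmul((uint8_t)v, b);
        uint8_t a = 0xCA;                    // a byte of A
        return tb[a] == gfmul(a, b) ? 0 : 1; // the product is now a single lookup
    }

The table costs 256 multiplies per source row but is reused across every row of A, which is why the kernel amortizes it over the whole column block.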
get8(val, i), w); + assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols); + gf.cpy_to_constant(); + MatGF256 tb(gf256_num * (1 << gf256_len), b.ncols); + + progress::ProgressBar pb("GPU MULTIPLY", a.width); + for (size_t w = 0; w < a.width; w++, pb.tick_display()) + { + size_t tb_num = min(gf256_num, a.ncols - w * gf256_num); + + dim3 block_tb(THREAD_X, THREAD_Y); + dim3 grid_tb((b.width - 1) / block_tb.x + 1, (tb_num * (1 << gf256_len) - 1) / block_tb.y + 1); + gpu_mktb_kernel<<<grid_tb, block_tb>>>(tb.data, tb.rowstride, b.at_base(w * gf256_num, 0), b.rowstride, tb.width); + cudaDeviceSynchronize(); + + dim3 block(THREAD_X, THREAD_Y); + dim3 grid((b.width - 1) / block.x + 1, (nrows - 1) / block.y + 1); + gpu_addmul_kernel<<<grid, block>>>(a.at_base(0, w), a.rowstride, tb.data, tb.rowstride, data, rowstride, tb_num, width, nrows); + cudaDeviceSynchronize(); + } } - *at_base(c, c_rowstride, r, w) ^= temp; -} -__host__ void MatGF256::gpu_addmul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf) -{ - assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols); - gf.cpy_to_constant(); - MatGF256 tb(gf256_num * (1 << gf256_len), b.ncols); - - progress::ProgressBar pb("GPU MULTIPLY", a.width); - for (size_t w = 0; w < a.width; w++, pb.tick_display()) + __host__ MatGF256 gpu_mul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf) { - size_t tb_num = min(gf256_num, a.ncols - w * gf256_num); - - dim3 block_tb(THREAD_X, THREAD_Y); - dim3 grid_tb((b.width - 1) / block_tb.x + 1, (tb_num * (1 << gf256_len) - 1) / block_tb.y + 1); - gpu_mktb_kernel<<<grid_tb, block_tb>>>(tb.data, tb.rowstride, b.at_base(w * gf256_num, 0), b.rowstride, tb.width); - cudaDeviceSynchronize(); - - dim3 block(THREAD_X, THREAD_Y); - dim3 grid((b.width - 1) / block.x + 1, (nrows - 1) / block.y + 1); - gf256_gpu_addmul_kernel<<<grid, block>>>(a.at_base(0, w), a.rowstride, tb.data, tb.rowstride, data, rowstride, tb_num, width, nrows); - cudaDeviceSynchronize(); + assert(a.ncols == b.nrows); + MatGF256 c(a.nrows, b.ncols); + c.gpu_addmul(a, b, gf); + return c; } } -__host__ MatGF256 gpu_mul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf) -{ - assert(a.ncols == b.nrows); - MatGF256 c(a.nrows, b.ncols); - c.gpu_addmul(a, b, gf); - return c; -} - #endif
diff --git a/include/gfp/gfp_header.cuh b/include/gfp/gfp_header.cuh index d7cecf7..830479a 100644 --- a/include/gfp/gfp_header.cuh +++ b/include/gfp/gfp_header.cuh @@ -3,25 +3,25 @@ #include "../header.cuh" -using gfp_t = uint32_t; -#define gfp_bits 32 - -static_assert(sizeof(gfp_t) * 8 == gfp_bits); - -static const gfp_t gfp = 65521; - -static const gfp_t gfp_zero = (gfp_t)0; -static const gfp_t gfp_one = (gfp_t)1; -static const gfp_t gfp_fullmask = (gfp_t)0xFF'FF; - -__managed__ gfp_t gfp_inv_table[gfp]; - -void init_inv_table() +namespace gfp { - gfp_inv_table[0] = 0; - gfp_inv_table[1] = 1; - for (int i = 2; i < gfp; ++i) - gfp_inv_table[i] = (gfp - gfp / i) * gfp_inv_table[gfp % i] % gfp; + using gfp_t = uint32_t; + + static const gfp_t gfprime = 65521; + + static const gfp_t gfp_zero = (gfp_t)0; + static const gfp_t gfp_one = (gfp_t)1; + static const gfp_t gfp_fullmask = (gfp_t)0xFF'FF; + + __managed__ gfp_t gfp_inv_table[gfprime]; + + void init_inv_table() + { + gfp_inv_table[0] = 0; + gfp_inv_table[1] = 1; + for (int i = 2; i < gfprime; ++i) + gfp_inv_table[i] = (gfprime - gfprime / i) * gfp_inv_table[gfprime % i] % gfprime; + } } #endif \ No newline at end of file
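init_inv_table uses the classic batch-inversion recurrence: writing p = q*i + r with q = p/i and r = p % i gives q*i + r ≡ 0 (mod p), hence i^{-1} ≡ (p - q) * r^{-1} (mod p), so each inverse is O(1) once the inverses of smaller residues are known. A quick host-side check for the same prime (the 64-bit cast is only for clarity; with p = 65521 the product also fits in the uint32_t arithmetic the diff uses):

    #include <cstdint>
    #include <vector>

    int main()
    {
        const uint32_t p = 65521;
        std::vector<uint32_t> inv(p);
        inv[1] = 1;
        for (uint32_t i = 2; i < p; i++)
            inv[i] = (uint64_t)(p - p / i) * inv[p % i] % p; // inv[p % i] is already filled
        for (uint32_t i = 1; i < p; i++)
            if ((uint64_t)i * inv[i] % p != 1) // verify i * inv(i) == 1 mod p
                return 1;
        return 0;
    }

Renaming the modulus from gfp to gfprime also removes the shadowing between the constant and the new gfp namespace.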
diff --git a/include/gfp/gfp_mat.cuh b/include/gfp/gfp_mat.cuh index 400e79c..69c7fff 100644 --- a/include/gfp/gfp_mat.cuh +++ b/include/gfp/gfp_mat.cuh @@ -7,236 +7,235 @@ #include #include -class MatGFP +namespace gfp { -public: - enum MatType + class MatGFP { - root, - window, - moved, - }; - // only used to construct root matrices - MatGFP(size_t nrows, size_t ncols) : nrows(nrows), ncols(ncols), type(root) - { - width = ncols; - rowstride = ((width - 1) / 8 + 1) * 8; // align to 32 bytes (8 x 32-bit words) - CUDA_CHECK(cudaMallocManaged((void **)&data, nrows * rowstride * sizeof(gfp_t))); - CUDA_CHECK(cudaMemset(data, 0, nrows * rowstride * sizeof(gfp_t))); - } - // windows can only be created at gfp_t granularity - MatGFP(const MatGFP &src, size_t begin_ri, size_t begin_wi, size_t end_rj, size_t end_wj) : nrows(end_rj - begin_ri), ncols(end_wj - begin_wi), width(end_wj - begin_wi), rowstride(src.rowstride), type(window), data(src.at_base(begin_ri, begin_wi)) - { - assert(begin_ri < end_rj && end_rj <= src.nrows && begin_wi < end_wj && end_wj <= src.width); - } - // copy construction always yields a root matrix - MatGFP(const MatGFP &m) : MatGFP(m.nrows, m.ncols) - { - CUDA_CHECK(cudaMemcpy2D(data, rowstride * sizeof(gfp_t), m.data, m.rowstride * sizeof(gfp_t), m.width * sizeof(gfp_t), nrows, cudaMemcpyDefault)); - } - MatGFP(MatGFP &&m) noexcept : nrows(m.nrows), ncols(m.ncols), width(m.width), rowstride(m.rowstride), type(m.type), data(m.data) - { - m.type = moved; - m.data = nullptr; - } - MatGFP &operator=(const MatGFP &m) - { - if (this == &m) + public: + enum MatType { + root, + window, + moved, + }; + // only used to construct root matrices + MatGFP(size_t nrows, size_t ncols) : nrows(nrows), ncols(ncols), type(root) + { + width = ncols; + rowstride = ((width - 1) / 8 + 1) * 8; // align to 32 bytes (8 x 32-bit words) + CUDA_CHECK(cudaMallocManaged((void **)&data, nrows * rowstride * sizeof(gfp_t))); + CUDA_CHECK(cudaMemset(data, 0, nrows * rowstride * sizeof(gfp_t))); + } + // windows can only be created at gfp_t granularity + MatGFP(const MatGFP &src, size_t begin_ri, size_t begin_wi, size_t end_rj, size_t end_wj) : nrows(end_rj - begin_ri), ncols(end_wj - begin_wi), width(end_wj - begin_wi), rowstride(src.rowstride), type(window), data(src.at_base(begin_ri, begin_wi)) + { + assert(begin_ri < end_rj && end_rj <= src.nrows && begin_wi < end_wj && end_wj <= src.width); + } + // copy construction always yields a root matrix + MatGFP(const MatGFP &m) : MatGFP(m.nrows, m.ncols) + { + CUDA_CHECK(cudaMemcpy2D(data, rowstride * sizeof(gfp_t), m.data, m.rowstride * sizeof(gfp_t), m.width * sizeof(gfp_t), nrows, cudaMemcpyDefault)); + } + MatGFP(MatGFP &&m) noexcept : nrows(m.nrows), ncols(m.ncols), width(m.width), rowstride(m.rowstride), type(m.type), data(m.data) + { + m.type = moved; + m.data = nullptr; + } + MatGFP &operator=(const MatGFP &m) + { + if (this == &m) + { + return *this; + } + assert(nrows == m.nrows && ncols == m.ncols); + CUDA_CHECK(cudaMemcpy2D(data, rowstride * sizeof(gfp_t), m.data, m.rowstride * sizeof(gfp_t), m.width * sizeof(gfp_t), nrows, cudaMemcpyDefault)); return *this; } - assert(nrows == m.nrows && ncols == m.ncols); - CUDA_CHECK(cudaMemcpy2D(data, rowstride * sizeof(gfp_t), m.data, m.rowstride * sizeof(gfp_t), m.width * sizeof(gfp_t), nrows, cudaMemcpyDefault)); - return *this; - } - MatGFP &operator=(MatGFP &&m) noexcept - { - if (this == &m) + MatGFP &operator=(MatGFP &&m) noexcept { + if (this == &m) + { + return *this; + } + if (type == root) + { + CUDA_CHECK(cudaFree(data)); + } + nrows = m.nrows; + ncols = m.ncols; + width = m.width; + rowstride = m.rowstride; + type = m.type; + data = m.data; + m.type = moved; + m.data = nullptr; return *this; } - if (type == root) - { - CUDA_CHECK(cudaFree(data)); - } - nrows = m.nrows; - ncols = m.ncols; - width = m.width; - rowstride = m.rowstride; - type = m.type; - data = m.data; - m.type = moved; - m.data = nullptr; - return *this; - } - ~MatGFP() - { - if (type == root) + ~MatGFP() { - CUDA_CHECK(cudaFree(data)); - } - } - - inline gfp_t *at_base(size_t r, size_t w) const - { - return data + r * rowstride + w; - } - - void randomize(uint_fast32_t seed) - { - assert(type == root); - static default_random_engine e(seed); - static uniform_int_distribution<gfp_t> d; - for (size_t r = 0; r < nrows; r++) - { - for (size_t w = 0; w < width; w++) + if (type == root) { - *at_base(r, w) = d(e) % gfp; + CUDA_CHECK(cudaFree(data)); } } - } - // generate a random reduced row echelon matrix; pick nrows pivot columns among the first rank_col columns - void randomize(size_t rank_col, uint_fast32_t seed) - { - assert(nrows <= rank_col && rank_col <= ncols); - randomize(seed); - vector<size_t> pivot(rank_col); - iota(pivot.begin(), pivot.end(), 0); - random_shuffle(pivot.begin(), pivot.end()); - pivot.resize(nrows); - sort(pivot.begin(), pivot.end()); - - vector<bool> pivotmask(width, true); - for (size_t r = 0; r < nrows; r++) + inline gfp_t *at_base(size_t r, size_t w) const { - pivotmask[pivot[r]] = false; + return data + r * rowstride + w; } - for (size_t r = 0; r < nrows; r++) + void randomize(uint_fast32_t seed) { - for (size_t w = 0; w < pivot[r]; w++) + assert(type == root); + static default_random_engine e(seed); + static uniform_int_distribution<gfp_t> d; + for (size_t r = 0; r < nrows; r++) { - *at_base(r, w) = base_zero; + for (size_t w = 0; w < width; w++) + { + *at_base(r, w) = d(e) % gfprime; + } } - *at_base(r, pivot[r]) = base_one; - for (size_t w = pivot[r] + 1; w < rank_col; w++) + } + + // generate a random reduced row echelon matrix; pick nrows pivot columns among the first rank_col columns + void randomize(size_t rank_col, uint_fast32_t seed) + { + assert(nrows <= rank_col && rank_col <= ncols); + randomize(seed); + vector<size_t> pivot(rank_col); + iota(pivot.begin(), pivot.end(), 0); + random_shuffle(pivot.begin(), pivot.end()); + pivot.resize(nrows); + sort(pivot.begin(), pivot.end()); + + vector<bool> pivotmask(width, true); + for (size_t r = 0; r < nrows; r++) { - if (!pivotmask[w]) + pivotmask[pivot[r]] = false; + } + + for (size_t r = 0; r < nrows; r++) + { + for (size_t w = 0; w < pivot[r]; w++) { *at_base(r, w) = base_zero; } - } - } - } - - bool operator==(const MatGFP &m) const - { - if (nrows != m.nrows || ncols != m.ncols) - { - return false; - } - for (size_t r = 0; r < nrows; r++) - { - for (size_t w = 0; w < width; w++) - { - if (*at_base(r, w) != *m.at_base(r, w)) + *at_base(r, pivot[r]) = base_one; + for (size_t w = pivot[r] + 1; w < rank_col; w++) { - return false; + if (!pivotmask[w]) + { + *at_base(r, w) = base_zero; + } } } } - return true; - } - bool operator==(const gfp_t base) const - { - for (size_t r = 0; r < nrows; r++) + bool operator==(const MatGFP &m) const { - for (size_t w = 0; w < width; w++) + if (nrows != m.nrows || ncols != m.ncols) { - if (*at_base(r, w) != base) + return false; + } + for (size_t r = 0; r < nrows; r++) + { + for (size_t w = 0; w < width; w++) { - return false; + if (*at_base(r, w) != *m.at_base(r, w)) + { + return false; + } + } + } + return true; + } + + bool operator==(const gfp_t base) const + { + for (size_t r = 0; r < nrows; r++) + { + for (size_t w = 0; w < width; w++) + { + if (*at_base(r, w) != base)
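The k <= 65536 bound asserted by cpu_addmul is exactly the overflow limit of its uint32_t accumulator: each addend is already reduced, so it is at most gfprime - 1 = 65520, and 65536 such terms still fit below 2^32 before the single final reduction. A one-line compile-time check of that arithmetic:

    #include <cstdint>

    // 65536 * 65520 = 4'294'901'760 <= UINT32_MAX = 4'294'967'295,
    // so the unreduced row-by-column sum in cpu_addmul cannot wrap.
    static_assert(65536ULL * 65520ULL <= UINT32_MAX, "k <= 65536 cannot overflow uint32_t");

    int main() { return 0; }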
+ { + return false; + } + } + } + return true; + } + void operator+=(const MatGFP &m) + { + assert(nrows == m.nrows && ncols == m.ncols); + for (size_t r = 0; r < nrows; r++) + { + for (size_t w = 0; w < width; w++) + { + *at_base(r, w) = (*at_base(r, w) + *m.at_base(r, w)) % gfprime; } } } - return true; - } - void operator+=(const MatGFP &m) - { - assert(nrows == m.nrows && ncols == m.ncols); - for (size_t r = 0; r < nrows; r++) + MatGFP operator+(const MatGFP &m) const { - for (size_t w = 0; w < width; w++) - { - *at_base(r, w) = (*at_base(r, w) + *m.at_base(r, w)) % gfp; - } + MatGFP temp(*this); + temp += m; + return temp; } - } - MatGFP operator+(const MatGFP &m) const - { - MatGFP temp(*this); - temp += m; - return temp; - } - // a(m*k)*b(k,n) where k must not exceed 65536 - void cpu_addmul(const MatGFP &a, const MatGFP &b) - { - assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols); - assert(a.ncols <= 65536); - for (size_t r = 0; r < nrows; r++) + // a(m*k)*b(k,n) where k must not exceed 65536 + void cpu_addmul(const MatGFP &a, const MatGFP &b) { - for (size_t w = 0; w < width; w++) + assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols); + assert(a.ncols <= 65536); + for (size_t r = 0; r < nrows; r++) { - for (size_t i = 0; i < a.ncols; i++) + for (size_t w = 0; w < width; w++) { - *at_base(r, w) += (*a.at_base(r, i) * *b.at_base(i, w)) % gfp; + for (size_t i = 0; i < a.ncols; i++) + { + *at_base(r, w) += (*a.at_base(r, i) * *b.at_base(i, w)) % gfprime; + } + *at_base(r, w) %= gfprime; } - *at_base(r, w) %= gfp; } } - } - void gpu_mul(const MatGFP &a, const MatGFP &b); + void gpu_mul(const MatGFP &a, const MatGFP &b); - MatGFP operator*(const MatGFP &m) const - { - MatGFP temp(nrows, m.width); - temp.gpu_mul(*this, m); - return temp; - } - - // void cpu_swap_row(size_t r1, size_t r2); - // ElimResult gpu_elim(); - - friend ostream &operator<<(ostream &out, const MatGFP &m); - - size_t nrows, ncols, width; - -private: - MatGFP() : nrows(0), ncols(0), width(0), rowstride(0), type(moved), data(nullptr) {} - - size_t rowstride; - MatType type; - gfp_t *data; -}; - -ostream &operator<<(ostream &out, const MatGFP &m) -{ - for (size_t r = 0; r < m.nrows; r++) - { - for (size_t w = 0; w < m.width; w++) + MatGFP operator*(const MatGFP &m) const { -#if gfp_bits == 64 - printf("%05lu ", *m.at_base(r, w)); -#else - printf("%05u ", *m.at_base(r, w)); -#endif + MatGFP temp(nrows, m.width); + temp.gpu_mul(*this, m); + return temp; } - printf("\n"); + + // void cpu_swap_row(size_t r1, size_t r2); + // ElimResult gpu_elim(); + + friend ostream &operator<<(ostream &out, const MatGFP &m); + + size_t nrows, ncols, width; + + private: + MatGFP() : nrows(0), ncols(0), width(0), rowstride(0), type(moved), data(nullptr) {} + + size_t rowstride; + MatType type; + gfp_t *data; + }; + + ostream &operator<<(ostream &out, const MatGFP &m) + { + for (size_t r = 0; r < m.nrows; r++) + { + for (size_t w = 0; w < m.width; w++) + { + printf("%05u ", *m.at_base(r, w)); + } + printf("\n"); + } + return out; } - return out; } #endif \ No newline at end of file
diff --git a/include/gfp/gfp_mul.cuh b/include/gfp/gfp_mul.cuh index d520215..2050a05 100644 --- a/include/gfp/gfp_mul.cuh +++ b/include/gfp/gfp_mul.cuh @@ -3,90 +3,83 @@ #include "gfp_mat.cuh" -static const int BlockRow = 128, BlockCol = 128; // each block computes one sub-tile of c -static const int StepSize = 8; // columns of A (rows of B) processed per loop iteration within a block - -static_assert(BlockCol % THREAD_X == 0 && BlockRow % THREAD_Y == 0); - -__global__ void gfp_gpu_mul_kernel(gfp_t *__restrict__ a, const size_t a_rs, gfp_t *__restrict__ b, const size_t b_rs, gfp_t *__restrict__ c, const size_t c_rs, const size_t nrows, const size_t ncols, const size_t nsteps) +namespace gfp { - const unsigned int bx = blockIdx.x; - const unsigned int by = blockIdx.y; - const unsigned int tx = threadIdx.x; - const unsigned int ty = threadIdx.y; - const unsigned int tid = ty * blockDim.x + tx; + static const int BlockRow = 128, BlockCol = 128; // each block computes one sub-tile of c + static const int StepSize = 8; // columns of A (rows of B) processed per loop iteration within a block -#if gfp_bits == 64 - __shared__ alignas(8) gfp_t s_a[StepSize][BlockRow]; - __shared__ alignas(8) gfp_t s_b[StepSize][BlockCol]; -#else - __shared__ gfp_t s_a[StepSize][BlockRow]; - __shared__ gfp_t s_b[StepSize][BlockCol]; -#endif + static_assert(BlockCol % THREAD_X == 0 && BlockRow % THREAD_Y == 0); - gfp_t tmp_c[BlockRow / THREAD_Y][BlockCol / THREAD_X] = {0}; - - for (int s = 0; s < (nsteps - 1) / StepSize + 1; s++) + __global__ void gpu_mul_kernel(gfp_t *__restrict__ a, const size_t a_rs, gfp_t *__restrict__ b, const size_t b_rs, gfp_t *__restrict__ c, const size_t c_rs, const size_t nrows, const size_t ncols, const size_t nsteps) { - for (int k = tid; k < StepSize * BlockRow; k += blockDim.x * blockDim.y) + + const unsigned int bx = blockIdx.x; + const unsigned int by = blockIdx.y; + const unsigned int tx = threadIdx.x; + const unsigned int ty = threadIdx.y; + const unsigned int tid = ty * blockDim.x + tx; + + __shared__ gfp_t s_a[StepSize][BlockRow]; + __shared__ gfp_t s_b[StepSize][BlockCol]; + + gfp_t tmp_c[BlockRow / THREAD_Y][BlockCol / THREAD_X] = {0}; + + for (int s = 0; s < (nsteps - 1) / StepSize + 1; s++) { - const int a_r = k / StepSize; - const int a_c = k % StepSize; - s_a[a_c][a_r] = by * BlockRow + a_r < nrows && s * StepSize + a_c < nsteps ? *at_base(a, a_rs, by * BlockRow + a_r, s * StepSize + a_c) : 0; - const int b_r = k / BlockCol; - const int b_c = k % BlockCol; - s_b[b_r][b_c] = s * StepSize + b_r < nsteps && bx * BlockCol + b_c < ncols ? *at_base(b, b_rs, s * StepSize + b_r, bx * BlockCol + b_c) : 0; - } - __syncthreads(); - for (int k = 0; k < StepSize; k++) - { - for (int j = 0; j < BlockRow / THREAD_Y; j++) + for (int k = tid; k < StepSize * BlockRow; k += blockDim.x * blockDim.y) { - for (int i = 0; i < BlockCol / THREAD_X; i++) + const int a_r = k / StepSize; + const int a_c = k % StepSize; + s_a[a_c][a_r] = by * BlockRow + a_r < nrows && s * StepSize + a_c < nsteps ? *at_base(a, a_rs, by * BlockRow + a_r, s * StepSize + a_c) : 0; + const int b_r = k / BlockCol; + const int b_c = k % BlockCol; + s_b[b_r][b_c] = s * StepSize + b_r < nsteps && bx * BlockCol + b_c < ncols ?
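In gpu_mul_kernel each thread block computes a BlockRow x BlockCol = 128x128 tile of c, so with THREAD_X = THREAD_Y = 16 every thread owns an 8x8 register sub-tile whose elements are strided THREAD_X apart for coalesced loads and stores. A host-side sketch verifying that this mapping covers each tile element exactly once (constants copied from the diff):

    #include <vector>

    int main()
    {
        const int BlockRow = 128, BlockCol = 128, TX = 16, TY = 16;
        std::vector<int> hits(BlockRow * BlockCol, 0);
        for (int ty = 0; ty < TY; ty++)
            for (int tx = 0; tx < TX; tx++)
                for (int j = 0; j < BlockRow / TY; j++)
                    for (int i = 0; i < BlockCol / TX; i++)
                        hits[(j * TY + ty) * BlockCol + (i * TX + tx)]++; // same index math as tmp_c
        for (int h : hits)
            if (h != 1)
                return 1; // every tile element is owned by exactly one thread
        return 0;
    }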
*at_base(b, b_rs, s * StepSize + b_r, bx * BlockCol + b_c) : 0; + } + __syncthreads(); + for (int k = 0; k < StepSize; k++) + { + for (int j = 0; j < BlockRow / THREAD_Y; j++) { -#if gfp_bits == 64 - tmp_c[j][i] += (s_a[k][j * THREAD_Y + ty] * s_b[k][i * THREAD_X + tx]); -#else - tmp_c[j][i] += (s_a[k][j * THREAD_Y + ty] * s_b[k][i * THREAD_X + tx]) % gfp; -#endif + for (int i = 0; i < BlockCol / THREAD_X; i++) + { + tmp_c[j][i] += (s_a[k][j * THREAD_Y + ty] * s_b[k][i * THREAD_X + tx]) % gfprime; + } + } + } + __syncthreads(); + if (s & gfp_fullmask == gfp_fullmask) + { + for (int j = 0; j < BlockRow / THREAD_Y; j++) + { + for (int i = 0; i < BlockCol / THREAD_X; i++) + { + tmp_c[j][i] %= gfprime; + } } } } - __syncthreads(); -#if gfp_bits != 64 - if (s & gfp_fullmask == gfp_fullmask) + for (int j = 0; j < BlockRow / THREAD_Y; j++) { - for (int j = 0; j < BlockRow / THREAD_Y; j++) + for (int i = 0; i < BlockCol / THREAD_X; i++) { - for (int i = 0; i < BlockCol / THREAD_X; i++) + if (by * BlockRow + j * THREAD_Y + ty < nrows && bx * BlockCol + i * THREAD_X + tx < ncols) { - tmp_c[j][i] %= gfp; + *at_base(c, c_rs, by * BlockRow + j * THREAD_Y + ty, bx * BlockCol + i * THREAD_X + tx) = tmp_c[j][i] % gfprime; } } } -#endif } - for (int j = 0; j < BlockRow / THREAD_Y; j++) + + __host__ void MatGFP::gpu_mul(const MatGFP &a, const MatGFP &b) { - for (int i = 0; i < BlockCol / THREAD_X; i++) - { - if (by * BlockRow + j * THREAD_Y + ty < nrows && bx * BlockCol + i * THREAD_X + tx < ncols) - { - *at_base(c, c_rs, by * BlockRow + j * THREAD_Y + ty, bx * BlockCol + i * THREAD_X + tx) = tmp_c[j][i] % gfp; - } - } + assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols); + + dim3 block(THREAD_X, THREAD_Y); + dim3 grid((width - 1) / block.x + 1, (nrows - 1) / block.y + 1); + gpu_mul_kernel<<<grid, block>>>(a.data, a.rowstride, b.data, b.rowstride, data, rowstride, nrows, width, a.width); + cudaDeviceSynchronize(); } } -__host__ void MatGFP::gpu_mul(const MatGFP &a, const MatGFP &b) -{ - assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols); - - dim3 block(THREAD_X, THREAD_Y); - dim3 grid((width - 1) / block.x + 1, (nrows - 1) / block.y + 1); - gfp_gpu_mul_kernel<<<grid, block>>>(a.data, a.rowstride, b.data, b.rowstride, data, rowstride, nrows, width, a.width); - cudaDeviceSynchronize(); -} - #endif
diff --git a/include/header.cuh b/include/header.cuh index a2bcb91..10bfa0b 100755 --- a/include/header.cuh +++ b/include/header.cuh @@ -6,14 +6,6 @@ #include -// matrix -// #include -// #include - -// #include -// #include -// #include - using namespace std; using base_t = uint64_t; @@ -25,13 +17,8 @@ static const base_t base_one = (base_t)0x00'00'00'00'00'00'00'01; static const base_t base_fullmask = (base_t)0xFF'FF'FF'FF'FF'FF'FF'FF; -static const size_t THREAD_X = 32; // columns -static const size_t THREAD_Y = 8; // rows - -// __host__ __device__ base_t *at_base(base_t *base, size_t rowstride, size_t r, size_t w) -// { -// return base + r * rowstride + w; -// } +static const size_t THREAD_X = 16; // columns +static const size_t THREAD_Y = 16; // rows template <typename T> __host__ __device__ T *at_base(T *base, size_t rowstride, size_t r, size_t w)
diff --git a/src/main.cu b/src/main.cu index e7f51c8..dfacffe 100644 --- a/src/main.cu +++ b/src/main.cu @@ -4,6 +4,8 @@ #undef SHOW_PROGRESS_BAR +using namespace gfp; + int main() { int m = 1000, k = 1000, n = 1000;
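The templated at_base in header.cuh replaces the commented-out base_t-only version, which is what lets one strided-indexing helper serve both the base_t (GF256) and gfp_t (GFp) matrices; rowstride is measured in elements, not bytes. A host-only illustration of the contract, with the CUDA qualifiers dropped for a plain C++ build:

    #include <cstdint>
    #include <cstddef>
    #include <cassert>

    template <typename T>
    T *at_base(T *base, size_t rowstride, size_t r, size_t w)
    {
        return base + r * rowstride + w; // row r, word w of a strided 2-D grid
    }

    int main()
    {
        uint64_t buf[3 * 4] = {};
        *at_base(buf, 4, 2, 1) = 42; // rowstride 4 elements
        assert(buf[2 * 4 + 1] == 42);
        return 0;
    }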
#include "test_header.cuh" +using namespace gf256; + bool test_gf256_elim(size_t rank, size_t rank_col, size_t nrows, size_t ncols, const GF256 &gf256, uint_fast32_t seed) { assert(rank <= nrows && rank <= rank_col && rank_col <= ncols); diff --git a/test/test_gf256_header.cu b/test/test_gf256_header.cu index 05b851d..3ada401 100644 --- a/test/test_gf256_header.cu +++ b/test/test_gf256_header.cu @@ -1,6 +1,8 @@ #include #include "test_header.cuh" +using namespace gf256; + vector expect_inv_table{ 0x00, 0x01, 0x8E, 0xF4, 0x47, 0xA7, 0x7A, 0xBA, 0xAD, 0x9D, 0xDD, 0x98, 0x3D, 0xAA, 0x5D, 0x96, 0xD8, 0x72, 0xC0, 0x58, 0xE0, 0x3E, 0x4C, 0x66, 0x90, 0xDE, 0x55, 0x80, 0xA0, 0x83, 0x4B, 0x2A, diff --git a/test/test_gf256_matrix.cu b/test/test_gf256_matrix.cu index 4ca6d37..fab5106 100644 --- a/test/test_gf256_matrix.cu +++ b/test/test_gf256_matrix.cu @@ -1,6 +1,8 @@ #include #include "test_header.cuh" +using namespace gf256; + TEST(TestGF256Matrix, Equal) { MatGF256 a(50, 50); diff --git a/test/test_gfp_mul.cu b/test/test_gfp_mul.cu index 0c819d5..2341fcc 100644 --- a/test/test_gfp_mul.cu +++ b/test/test_gfp_mul.cu @@ -1,6 +1,8 @@ #include #include "test_header.cuh" +using namespace gfp; + bool test_gfp_mul(size_t m, size_t k, size_t n, uint_fast32_t seed) { MatGFP a(m, k);