From 011176d64c7bdea7a6576f67c3a5368300a57d32 Mon Sep 17 00:00:00 2001
From: shijin
Date: Thu, 19 Sep 2024 15:59:53 +0800
Subject: [PATCH] gfp Gaussian elimination completed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 include/cuelim.cuh           |   1 +
 include/gf256/gf256_elim.cuh |  16 ---
 include/gf256/gf256_mat.cuh  |  19 ++-
 include/gfp/gfp_elim.cuh     | 237 +++++++++++++++++++++++++++++++++++
 include/gfp/gfp_header.cuh   |  23 ++++
 include/gfp/gfp_mat.cuh      |  51 +++++++-
 src/main.cu                  |  23 ++--
 7 files changed, 339 insertions(+), 31 deletions(-)
 create mode 100644 include/gfp/gfp_elim.cuh

diff --git a/include/cuelim.cuh b/include/cuelim.cuh
index 68a4fbc..d0bce0c 100644
--- a/include/cuelim.cuh
+++ b/include/cuelim.cuh
@@ -5,5 +5,6 @@
 
 #include "gf256/gf256_elim.cuh"
 #include "gfp/gfp_mul.cuh"
+#include "gfp/gfp_elim.cuh"
 
 #endif
\ No newline at end of file
diff --git a/include/gf256/gf256_elim.cuh b/include/gf256/gf256_elim.cuh
index 2e352c9..3258945 100644
--- a/include/gf256/gf256_elim.cuh
+++ b/include/gf256/gf256_elim.cuh
@@ -5,22 +5,6 @@
 
 namespace gf256
 {
-    void MatGF256::cpu_swap_row(size_t r1, size_t r2)
-    {
-        if (r1 == r2)
-        {
-            return;
-        }
-        base_t *p1 = at_base(r1, 0);
-        base_t *p2 = at_base(r2, 0);
-        for (size_t i = 0; i < width; i++)
-        {
-            base_t temp = p1[i];
-            p1[i] = p2[i];
-            p2[i] = temp;
-        }
-    }
-
     size_t cpu_elim_base(base_t *base_col, base_t base_col_len, size_t st_r, size_t w, vector<size_t> &p_col, vector<size_t> &p_row, const GF256 &gf)
     {
         size_t rank = 0;
diff --git a/include/gf256/gf256_mat.cuh b/include/gf256/gf256_mat.cuh
index 4d14593..1cb7b97 100755
--- a/include/gf256/gf256_mat.cuh
+++ b/include/gf256/gf256_mat.cuh
@@ -195,9 +195,6 @@ namespace gf256
         void gpu_addmul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf);
         friend MatGF256 gpu_mul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf);
 
-        // size_t cpu_elim_base(base_t *base_col, size_t st_r, size_t w, vector<size_t> &p_col, vector<size_t> &p_row, base_t step[gf256_num], const GF256 &gf);
-        void cpu_swap_row(size_t r1, size_t r2);
-        // void cpu_mul_row(size_t r, gf256_t val, const GF256 &gf);
         ElimResult gpu_elim(const GF256 &gf);
 
         friend ostream &operator<<(ostream &out, const MatGF256 &m);
@@ -207,6 +204,22 @@
     private:
         MatGF256() : nrows(0), ncols(0), width(0), rowstride(0), type(moved), data(nullptr) {}
 
+        void cpu_swap_row(size_t r1, size_t r2)
+        {
+            if (r1 == r2)
+            {
+                return;
+            }
+            base_t *p1 = at_base(r1, 0);
+            base_t *p2 = at_base(r2, 0);
+            for (size_t i = 0; i < width; i++)
+            {
+                base_t temp = p1[i];
+                p1[i] = p2[i];
+                p2[i] = temp;
+            }
+        }
+
         size_t rowstride;
         MatType type;
        base_t *data;
diff --git a/include/gfp/gfp_elim.cuh b/include/gfp/gfp_elim.cuh
new file mode 100644
index 0000000..7266b75
--- /dev/null
+++ b/include/gfp/gfp_elim.cuh
@@ -0,0 +1,237 @@
+#ifndef GFP_ELIM_CUH
+#define GFP_ELIM_CUH
+
+#include "gfp_mat.cuh"
+
+static const size_t gfp_block_size = 8;
+
+namespace gfp
+{
+    ElimResult MatGFP::cpu_elim()
+    {
+        init_inv_table();
+        size_t rank = 0;
+        vector<size_t> pivot;
+        vector<size_t> swap_row;
+        for (size_t w = 0; w < width; w++)
+        {
+            if (*at_base(rank, w) != gfp_zero)
+            {
+                swap_row.push_back(rank);
+            }
+            else
+            {
+                for (size_t r = rank + 1; r < nrows; r++)
+                {
+                    if (*at_base(r, w) != gfp_zero)
+                    {
+                        swap_row.push_back(r);
+                        cpu_swap_row(rank, r);
+                        break;
+                    }
+                }
+                continue;
+            }
+            pivot.push_back(w);
+
+            gfp_t inv = gfp_inv(*at_base(rank, w));
+
+            for (size_t wi = w; wi < width; wi++)
+            {
+                *at_base(rank, wi) = (*at_base(rank, wi) * inv) % gfprime;
+            }
+
+            for (size_t r = 0; r < nrows; r++)
+            {
+                if (r == rank)
+                {
+                    continue;
+                }
+
+                gfp_t mul = *at_base(r, w);
+
+                for (size_t wi = w; wi < width; wi++)
+                {
+                    *at_base(r, wi) = (*at_base(r, wi) + (*at_base(rank, wi) * (gfprime - mul))) % gfprime;
+                }
+            }
+
+            rank++;
+            if (rank == nrows)
+            {
+                break;
+            }
+        }
+        return {rank, pivot, swap_row};
+    }
+
+    size_t cpu_elim_base(MatGFP &submat, base_t submat_nrows, size_t st_r, size_t w, vector<size_t> &p_col, vector<size_t> &p_row)
+    {
+        size_t rank = 0;
+        size_t pivot[gfp_block_size];
+        size_t next[gfp_block_size];
+        for (size_t pivot_col = 0; pivot_col < gfp_block_size; pivot_col++)
+        {
+            for (size_t r = rank; r < submat_nrows; r++)
+            {
+                for (size_t i = 0; i < rank; i++)
+                {
+                    if (next[i] == r)
+                    {
+                        gfp_t mul = *submat.at_base(r, pivot[i]);
+                        for (size_t j = pivot[i] + 1; j < gfp_block_size; j++)
+                        {
+                            *submat.at_base(r, j) = (*submat.at_base(r, j) + *submat.at_base(i, j) * (gfprime - mul)) % gfprime;
+                        }
+                        next[i]++;
+                    }
+                }
+
+                if (*submat.at_base(r, pivot_col) != 0)
+                {
+                    p_col.push_back(w + pivot_col);
+                    p_row.push_back(st_r + r);
+                    if (r != rank)
+                    {
+                        submat.cpu_swap_row(rank, r);
+                    }
+                    gfp_t inv = gfp_inv(*submat.at_base(rank, pivot_col));
+                    for (size_t j = pivot_col + 1; j < gfp_block_size; j++)
+                    {
+                        *submat.at_base(rank, j) = (inv * (*submat.at_base(rank, j))) % gfprime;
+                    }
+                    pivot[rank] = pivot_col;
+                    next[rank] = rank + 1;
+                    rank++;
+                    break;
+                }
+            }
+        }
+        return rank;
+    }
+
+    __global__ void gpu_elim_kernel(gfp_t *__restrict__ idx, const size_t idx_rs, gfp_t *__restrict__ data, const size_t rs, const size_t st_src, const size_t rank, const gfp_t pivot_base, const size_t nrows, const size_t ncols, const size_t nsteps)
+    {
+
+        const unsigned int bx = blockIdx.x;
+        const unsigned int by = blockIdx.y;
+        const unsigned int tx = threadIdx.x;
+        const unsigned int ty = threadIdx.y;
+        const unsigned int tid = ty * blockDim.x + tx;
+
+        __shared__ gfp_t s_idx[StepSize][BlockRow];
+        __shared__ gfp_t s_src[StepSize][BlockCol];
+
+        gfp_t tmp_c[BlockRow / THREAD_Y][BlockCol / THREAD_X] = {0};
+        for (int k = tid; k < StepSize * BlockRow; k += blockDim.x * blockDim.y)
+        {
+            const int a_r = k / StepSize;
+            const int a_c = k % StepSize;
+            s_idx[a_c][a_r] = (by * BlockRow + a_r < nrows && a_c < nsteps) ? *at_base(idx, idx_rs, by * BlockRow + a_r, a_c) : 0;
+            const int b_r = k / BlockCol;
+            const int b_c = k % BlockCol;
+            s_src[b_r][b_c] = (b_r < rank && bx * BlockCol + b_c < ncols) ? *at_base(data, rs, st_src + b_r, bx * BlockCol + b_c) : 0;
+            s_src[b_r][b_c] = s_src[b_r][b_c] ? gfprime - s_src[b_r][b_c] : 0;
+        }
+        __syncthreads();
+        // if (bx == 0 && by == 0 && tid == 0)
+        // {
+        //     for (int i = 0; i < StepSize; i++)
+        //     {
+        //         for (int j = 0; j < BlockRow; j++)
+        //         {
+        //             printf("%05d ", s_idx[i][j]);
+        //         }
+        //         printf("\n");
+        //     }
+        //     for (int i = 0; i < StepSize; i++)
+        //     {
+        //         for (int j = 0; j < BlockCol; j++)
+        //         {
+        //             printf("%05d ", s_src[i][j]);
+        //         }
+        //         printf("\n");
+        //     }
+        // }
+        for (int k = 0; k < rank; k++)
+        {
+            for (int j = 0; j < BlockRow / THREAD_Y; j++)
+            {
+                for (int i = 0; i < BlockCol / THREAD_X; i++)
+                {
+                    tmp_c[j][i] += (s_idx[get4(pivot_base, k)][j * THREAD_Y + ty] * s_src[k][i * THREAD_X + tx]) % gfprime;
+                }
+            }
+        }
+
+        for (int j = 0; j < BlockRow / THREAD_Y; j++)
+        {
+            for (int i = 0; i < BlockCol / THREAD_X; i++)
+            {
+                if (by * BlockRow + j * THREAD_Y + ty < nrows && (by * BlockRow + j * THREAD_Y + ty < st_src || by * BlockRow + j * THREAD_Y + ty >= st_src + rank) && bx * BlockCol + i * THREAD_X + tx < ncols)
+                {
+                    gfp_t *dst = at_base(data, rs, by * BlockRow + j * THREAD_Y + ty, bx * BlockCol + i * THREAD_X + tx);
+                    *dst = (*dst + tmp_c[j][i]) % gfprime;
+                }
+            }
+        }
+    }
+
+    __host__ ElimResult MatGFP::gpu_elim()
+    {
+        init_inv_table();
+
+        MatGFP submat(nrows, gfp_block_size);
+
+        size_t rank = 0;
+        vector<size_t> p_col, p_row;
+
+        progress::ProgressBar pb("GPU ELIMINATE", (width - 1) / gfp_block_size + 1);
+        for (size_t w = 0; w < (width - 1) / gfp_block_size + 1; w++, pb.tick_display())
+        {
+            size_t st_w = w * gfp_block_size;
+            size_t ed_w = min(st_w + gfp_block_size, width);
+
+            submat.deepcopy(*this, rank, st_w, nrows, ed_w);
+
+            size_t src_rank = cpu_elim_base(submat, nrows - rank, rank, st_w, p_col, p_row);
+
+            if (src_rank == 0)
+            {
+                continue;
+            }
+            gfp_t pivot_base = gfp_zero;
+            for (size_t r = rank; r < rank + src_rank; r++)
+            {
+                cpu_swap_row(r, p_row[r]);
+                for (size_t i = rank; i < r; i++)
+                {
+                    cpu_addmul_row(r, i, gfprime - *at_base(r, p_col[i]), p_col[i]);
+                }
+                cpu_mul_row(r, gfp_inv(*at_base(r, p_col[r])), p_col[r]);
+                for (size_t i = rank; i < r; i++)
+                {
+                    cpu_addmul_row(i, r, gfprime - *at_base(i, p_col[r]), p_col[r]);
+                }
+                set4(pivot_base, p_col[r] - st_w, r - rank);
+            }
+
+            submat.deepcopy(*this, 0, st_w, nrows, ed_w);
+
+            dim3 block(THREAD_X, THREAD_Y);
+            dim3 grid((width - st_w - 1) / block.x + 1, (nrows - 1) / block.y + 1);
+            gpu_elim_kernel<<<grid, block>>>(submat.data, submat.rowstride, at_base(0, st_w), rowstride, rank, src_rank, pivot_base, nrows, width - st_w, gfp_block_size);
+            cudaDeviceSynchronize();
+
+            rank += src_rank;
+
+            if (rank == nrows)
+            {
+                break;
+            }
+        }
+        return {rank, p_col, p_row};
+    }
+}
+
+#endif
\ No newline at end of file
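
Note on the scheme above: each pass of gpu_elim copies one gfp_block_size-wide slab of columns into submat, cpu_elim_base reduces that slab on the host, and the pivot columns it finds are packed four bits apiece into the single word pivot_base via set4; gpu_elim_kernel then unpacks them with get4 (both helpers are added in gfp_header.cuh below) while it updates the remaining columns. A minimal standalone sketch of just that packing, with uint32_t standing in for gfp_t and the two helpers re-declared locally so the snippet compiles on its own:

// Standalone illustration of the 4-bit pivot packing used by gpu_elim and
// gpu_elim_kernel; uint32_t stands in for gfp_t here.
#include <cassert>
#include <cstdint>

static inline uint32_t offset4(uint32_t idx) { return idx << 2; }                                   // 4 bits per slot
static inline void set4(uint32_t &dst, uint32_t src, uint32_t idx) { dst |= src << offset4(idx); }  // slot must be 0
static inline uint32_t get4(uint32_t src, uint32_t idx) { return (src >> offset4(idx)) & 0xF; }

int main()
{
    // Suppose cpu_elim_base found pivots in columns 0, 2, 3 and 7 of the slab.
    uint32_t pivot_cols[4] = {0, 2, 3, 7};
    uint32_t pivot_base = 0;
    for (uint32_t r = 0; r < 4; r++)
    {
        set4(pivot_base, pivot_cols[r], r); // slot r holds the column of the r-th pivot
    }
    for (uint32_t r = 0; r < 4; r++)
    {
        assert(get4(pivot_base, r) == pivot_cols[r]); // what the kernel reads back
    }
    return 0;
}

Packing at most gfp_block_size = 8 small indices into one scalar keeps the kernel argument list free of an extra device-side array for the pivot positions.
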
diff --git a/include/gfp/gfp_header.cuh b/include/gfp/gfp_header.cuh
index 830479a..3470866 100644
--- a/include/gfp/gfp_header.cuh
+++ b/include/gfp/gfp_header.cuh
@@ -20,8 +20,31 @@ namespace gfp
         gfp_inv_table[0] = 0;
         gfp_inv_table[1] = 1;
         for (int i = 2; i < gfprime; ++i)
+        {
            gfp_inv_table[i] = (gfprime - gfprime / i) * gfp_inv_table[gfprime % i] % gfprime;
+        }
     }
 
+    gfp_t gfp_inv(gfp_t a)
+    {
+        return gfp_inv_table[a];
+    }
+
+    __host__ __device__ inline size_t offset4(size_t idx)
+    {
+        return idx << 2;
+    }
+
+    __host__ __device__ inline size_t get4(gfp_t src, size_t idx)
+    {
+        return (src >> offset4(idx)) & 0xF;
+    }
+
+    // the caller must make sure the 4-bit slot that set4 writes to is still 0
+    __host__ __device__ inline void set4(gfp_t &dst, size_t src, size_t idx)
+    {
+        dst |= (gfp_t)src << offset4(idx);
+    }
+
 }
 
 #endif
\ No newline at end of file
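
The inverse table filled by init_inv_table above relies on a standard identity for a prime modulus: writing gfprime = q * i + r with q = gfprime / i and r = gfprime % i gives q * i ≡ -r (mod gfprime), hence inv(i) ≡ -q * inv(r) ≡ (gfprime - q) * inv(r), and r < i guarantees inv(r) has already been computed. A small self-contained check of that recurrence; the prime 251 is only an example, not necessarily the repository's gfprime:

// Verify the inverse-table recurrence from init_inv_table for a small prime.
#include <cassert>
#include <cstdint>

int main()
{
    const uint32_t p = 251; // example prime only
    uint32_t inv[251];
    inv[1] = 1;
    for (uint32_t i = 2; i < p; i++)
    {
        inv[i] = (p - p / i) * inv[p % i] % p; // inv(i) = -(p / i) * inv(p % i) mod p
    }
    for (uint32_t i = 1; i < p; i++)
    {
        assert(i * inv[i] % p == 1); // every table entry really is an inverse
    }
    return 0;
}
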
diff --git a/include/gfp/gfp_mat.cuh b/include/gfp/gfp_mat.cuh
index 69c7fff..8252d03 100644
--- a/include/gfp/gfp_mat.cuh
+++ b/include/gfp/gfp_mat.cuh
@@ -9,6 +9,13 @@
 
 namespace gfp
 {
+    struct ElimResult
+    {
+        size_t rank;
+        vector<size_t> pivot;
+        vector<size_t> swap_row;
+    };
+
     class MatGFP
     {
     public:
@@ -76,6 +83,8 @@
         {
             if (type == root)
             {
+                // cout << nrows << " " << ncols << endl;
+                // cout << *data << endl;
                 CUDA_CHECK(cudaFree(data));
             }
         }
@@ -99,6 +108,12 @@
             }
         }
 
+        void deepcopy(const MatGFP &src, size_t begin_ri, size_t begin_wi, size_t end_rj, size_t end_wj)
+        {
+            assert(end_rj - begin_ri <= nrows && end_wj - begin_wi <= width);
+            CUDA_CHECK(cudaMemcpy2D(data, rowstride * sizeof(gfp_t), src.at_base(begin_ri, begin_wi), src.rowstride * sizeof(gfp_t), (end_wj - begin_wi) * sizeof(gfp_t), end_rj - begin_ri, cudaMemcpyDefault));
+        }
+
         // generate a random reduced row echelon matrix: pick nrows pivot columns from the first rank_col columns
         void randomize(size_t rank_col, uint_fast32_t seed)
         {
@@ -209,8 +224,40 @@
             return temp;
         }
 
-        // void cpu_swap_row(size_t r1, size_t r2);
-        // ElimResult gpu_elim();
+        void cpu_swap_row(size_t r1, size_t r2)
+        {
+            if (r1 == r2)
+            {
+                return;
+            }
+            gfp_t *p1 = at_base(r1, 0);
+            gfp_t *p2 = at_base(r2, 0);
+            for (size_t i = 0; i < width; i++)
+            {
+                gfp_t temp = p1[i];
+                p1[i] = p2[i];
+                p2[i] = temp;
+            }
+        }
+
+        void cpu_mul_row(size_t r, gfp_t mul, size_t begin_w = 0, size_t end_w = 0)
+        {
+            for (size_t i = begin_w; i < (end_w ? end_w : width); i++)
+            {
+                *at_base(r, i) = (*at_base(r, i) * mul) % gfprime;
+            }
+        }
+
+        void cpu_addmul_row(size_t r1, size_t r2, gfp_t mul, size_t begin_w = 0, size_t end_w = 0)
+        {
+            for (size_t i = begin_w; i < (end_w ? end_w : width); i++)
+            {
+                *at_base(r1, i) = (*at_base(r1, i) + *at_base(r2, i) * mul) % gfprime;
+            }
+        }
+
+        ElimResult gpu_elim();
+        ElimResult cpu_elim();
 
         friend ostream &operator<<(ostream &out, const MatGFP &m);
 
diff --git a/src/main.cu b/src/main.cu
index dfacffe..e67ed34 100644
--- a/src/main.cu
+++ b/src/main.cu
@@ -6,16 +6,19 @@
 
 using namespace gfp;
 
+bool test_gfp_elim(size_t rank, size_t rank_col, size_t nrows, size_t ncols, uint_fast32_t seed)
+{
+    MatGFP rdc(rank, ncols);
+    rdc.randomize(rank_col, seed);
+    MatGFP mix(nrows, rank);
+    mix.randomize(seed);
+    MatGFP a = mix * rdc;
+    ElimResult res = a.gpu_elim();
+    MatGFP win(a, 0, 0, res.rank, a.width);
+    return rdc == win;
+}
+
 int main()
 {
-    int m = 1000, k = 1000, n = 1000;
-    MatGFP a(m, k);
-    MatGFP b(k, n);
-    MatGFP c(m, n);
-    a.randomize(10);
-    b.randomize(10);
-    c.cpu_addmul(a, b);
-    MatGFP d(m, n);
-    d.gpu_mul(a, b);
-    cout << (c == d) << endl;
+    cout << test_gfp_elim(1234, 2345, 3456, 4567, 41921095) << endl;
 }
\ No newline at end of file
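
test_gfp_elim above validates gpu_elim by construction: rdc is a random matrix that is already in reduced row echelon form, mix * rdc mixes its rows, and because the reduced row echelon form of a matrix is unique, eliminating mix * rdc should reproduce rdc in the first res.rank rows (the window win), provided the random mixing matrix has full column rank, which is overwhelmingly likely over a prime field. For completeness, a hypothetical calling sketch for the new entry point; the umbrella include and the matrix sizes are illustrative assumptions, while the constructor, randomize, gpu_elim and the ElimResult fields are the ones used in this patch:

// Hypothetical usage sketch of the elimination API added by this patch.
#include <iostream>
#include "cuelim.cuh" // assumed umbrella header, as extended at the top of this patch

using namespace gfp;

int main()
{
    MatGFP a(512, 1024);           // example dimensions only
    a.randomize(12345);            // fill with random entries, as in the old main()
    ElimResult res = a.gpu_elim(); // reduce a in place on the GPU
    // res.rank  : number of pivots found
    // res.pivot : pivot column recorded for each pivot row
    std::cout << res.rank << std::endl;
    return 0;
}
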