From 011176d64c7bdea7a6576f67c3a5368300a57d32 Mon Sep 17 00:00:00 2001
From: shijin
Date: Thu, 19 Sep 2024 15:59:53 +0800
Subject: [PATCH] gfp Gaussian elimination completed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 include/cuelim.cuh           |   1 +
 include/gf256/gf256_elim.cuh |  16 ---
 include/gf256/gf256_mat.cuh  |  19 ++-
 include/gfp/gfp_elim.cuh     | 237 +++++++++++++++++++++++++++++++++++
 include/gfp/gfp_header.cuh   |  23 ++++
 include/gfp/gfp_mat.cuh      |  51 +++++++-
 src/main.cu                  |  23 ++--
 7 files changed, 339 insertions(+), 31 deletions(-)
 create mode 100644 include/gfp/gfp_elim.cuh

diff --git a/include/cuelim.cuh b/include/cuelim.cuh
index 68a4fbc..d0bce0c 100644
--- a/include/cuelim.cuh
+++ b/include/cuelim.cuh
@@ -5,5 +5,6 @@
 
 #include "gf256/gf256_elim.cuh"
 #include "gfp/gfp_mul.cuh"
+#include "gfp/gfp_elim.cuh"
 
 #endif
\ No newline at end of file
diff --git a/include/gf256/gf256_elim.cuh b/include/gf256/gf256_elim.cuh
index 2e352c9..3258945 100644
--- a/include/gf256/gf256_elim.cuh
+++ b/include/gf256/gf256_elim.cuh
@@ -5,22 +5,6 @@
 
 namespace gf256
 {
-    void MatGF256::cpu_swap_row(size_t r1, size_t r2)
-    {
-        if (r1 == r2)
-        {
-            return;
-        }
-        base_t *p1 = at_base(r1, 0);
-        base_t *p2 = at_base(r2, 0);
-        for (size_t i = 0; i < width; i++)
-        {
-            base_t temp = p1[i];
-            p1[i] = p2[i];
-            p2[i] = temp;
-        }
-    }
-
     size_t cpu_elim_base(base_t *base_col, base_t base_col_len, size_t st_r, size_t w, vector<size_t> &p_col, vector<size_t> &p_row, const GF256 &gf)
     {
         size_t rank = 0;
diff --git a/include/gf256/gf256_mat.cuh b/include/gf256/gf256_mat.cuh
index 4d14593..1cb7b97 100755
--- a/include/gf256/gf256_mat.cuh
+++ b/include/gf256/gf256_mat.cuh
@@ -195,9 +195,6 @@ namespace gf256
         void gpu_addmul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf);
         friend MatGF256 gpu_mul(const MatGF256 &a, const MatGF256 &b, const GF256 &gf);
 
-        // size_t cpu_elim_base(base_t *base_col, size_t st_r, size_t w, vector<size_t> &p_col, vector<size_t> &p_row, base_t step[gf256_num], const GF256 &gf);
-        void cpu_swap_row(size_t r1, size_t r2);
-        // void cpu_mul_row(size_t r, gf256_t val, const GF256 &gf);
         ElimResult gpu_elim(const GF256 &gf);
 
         friend ostream &operator<<(ostream &out, const MatGF256 &m);
@@ -207,6 +204,22 @@
     private:
         MatGF256() : nrows(0), ncols(0), width(0), rowstride(0), type(moved), data(nullptr) {}
 
+        void cpu_swap_row(size_t r1, size_t r2)
+        {
+            if (r1 == r2)
+            {
+                return;
+            }
+            base_t *p1 = at_base(r1, 0);
+            base_t *p2 = at_base(r2, 0);
+            for (size_t i = 0; i < width; i++)
+            {
+                base_t temp = p1[i];
+                p1[i] = p2[i];
+                p2[i] = temp;
+            }
+        }
+
         size_t rowstride;
         MatType type;
        base_t *data;
diff --git a/include/gfp/gfp_elim.cuh b/include/gfp/gfp_elim.cuh
new file mode 100644
index 0000000..7266b75
--- /dev/null
+++ b/include/gfp/gfp_elim.cuh
@@ -0,0 +1,237 @@
+#ifndef GFP_ELIM_CUH
+#define GFP_ELIM_CUH
+
+#include "gfp_mat.cuh"
+
+static const size_t gfp_block_size = 8;
+
+namespace gfp
+{
+    ElimResult MatGFP::cpu_elim()
+    {
+        init_inv_table();
+        size_t rank = 0;
+        vector<size_t> pivot;
+        vector<size_t> swap_row;
+        for (size_t w = 0; w < width; w++)
+        {
+            if (*at_base(rank, w) != gfp_zero)
+            {
+                swap_row.push_back(rank);
+            }
+            else
+            {
+                for (size_t r = rank + 1; r < nrows; r++)
+                {
+                    if (*at_base(r, w) != gfp_zero)
+                    {
+                        swap_row.push_back(r);
+                        cpu_swap_row(rank, r);
+                        break;
+                    }
+                }
+                continue;
+            }
+            pivot.push_back(w);
+
+            gfp_t inv = gfp_inv(*at_base(rank, w));
+
+            for (size_t wi = w; wi < width; wi++)
+            {
+                *at_base(rank, wi) = (*at_base(rank, wi) * inv) % gfprime;
+            }
+
+            for (size_t r = 0; r < nrows; r++)
+            {
+                if (r == rank)
+                {
+                    continue;
+                }
+
+                gfp_t mul = *at_base(r, w);
+
+                for (size_t wi = w; wi < width; wi++)
+                {
+                    *at_base(r, wi) = (*at_base(r, wi) + (*at_base(rank, wi) * (gfprime - mul))) % gfprime;
+                }
+            }
+
+            rank++;
+            if (rank == nrows)
+            {
+                break;
+            }
+        }
+        return {rank, pivot, swap_row};
+    }
+
+    size_t cpu_elim_base(MatGFP &submat, base_t submat_nrows, size_t st_r, size_t w, vector<size_t> &p_col, vector<size_t> &p_row)
+    {
+        size_t rank = 0;
+        size_t pivot[gfp_block_size];
+        size_t next[gfp_block_size];
+        for (size_t pivot_col = 0; pivot_col < gfp_block_size; pivot_col++)
+        {
+            for (size_t r = rank; r < submat_nrows; r++)
+            {
+                for (size_t i = 0; i < rank; i++)
+                {
+                    if (next[i] == r)
+                    {
+                        gfp_t mul = *submat.at_base(r, pivot[i]);
+                        for (size_t j = pivot[i] + 1; j < gfp_block_size; j++)
+                        {
+                            *submat.at_base(r, j) = (*submat.at_base(r, j) + *submat.at_base(i, j) * (gfprime - mul)) % gfprime;
+                        }
+                        next[i]++;
+                    }
+                }
+
+                if (*submat.at_base(r, pivot_col) != 0)
+                {
+                    p_col.push_back(w + pivot_col);
+                    p_row.push_back(st_r + r);
+                    if (r != rank)
+                    {
+                        submat.cpu_swap_row(rank, r);
+                    }
+                    gfp_t inv = gfp_inv(*submat.at_base(rank, pivot_col));
+                    for (size_t j = pivot_col + 1; j < gfp_block_size; j++)
+                    {
+                        *submat.at_base(rank, j) = (inv * (*submat.at_base(rank, j))) % gfprime;
+                    }
+                    pivot[rank] = pivot_col;
+                    next[rank] = rank + 1;
+                    rank++;
+                    break;
+                }
+            }
+        }
+        return rank;
+    }
+
+    __global__ void gpu_elim_kernel(gfp_t *__restrict__ idx, const size_t idx_rs, gfp_t *__restrict__ data, const size_t rs, const size_t st_src, const size_t rank, const gfp_t pivot_base, const size_t nrows, const size_t ncols, const size_t nsteps)
+    {
+
+        const unsigned int bx = blockIdx.x;
+        const unsigned int by = blockIdx.y;
+        const unsigned int tx = threadIdx.x;
+        const unsigned int ty = threadIdx.y;
+        const unsigned int tid = ty * blockDim.x + tx;
+
+        __shared__ gfp_t s_idx[StepSize][BlockRow];
+        __shared__ gfp_t s_src[StepSize][BlockCol];
+
+        gfp_t tmp_c[BlockRow / THREAD_Y][BlockCol / THREAD_X] = {0};
+        for (int k = tid; k < StepSize * BlockRow; k += blockDim.x * blockDim.y)
+        {
+            const int a_r = k / StepSize;
+            const int a_c = k % StepSize;
+            s_idx[a_c][a_r] = (by * BlockRow + a_r < nrows && a_c < nsteps) ? *at_base(idx, idx_rs, by * BlockRow + a_r, a_c) : 0;
+            const int b_r = k / BlockCol;
+            const int b_c = k % BlockCol;
+            s_src[b_r][b_c] = (b_r < rank && bx * BlockCol + b_c < ncols) ? *at_base(data, rs, st_src + b_r, bx * BlockCol + b_c) : 0;
+            s_src[b_r][b_c] = s_src[b_r][b_c] ? gfprime - s_src[b_r][b_c] : 0;
+        }
+        __syncthreads();
+        // if (bx == 0 && by == 0 && tid == 0)
+        // {
+        //     for (int i = 0; i < StepSize; i++)
+        //     {
+        //         for (int j = 0; j < BlockRow; j++)
+        //         {
+        //             printf("%05d ", s_idx[i][j]);
+        //         }
+        //         printf("\n");
+        //     }
+        //     for (int i = 0; i < StepSize; i++)
+        //     {
+        //         for (int j = 0; j < BlockCol; j++)
+        //         {
+        //             printf("%05d ", s_src[i][j]);
+        //         }
+        //         printf("\n");
+        //     }
+        // }
+        for (int k = 0; k < rank; k++)
+        {
+            for (int j = 0; j < BlockRow / THREAD_Y; j++)
+            {
+                for (int i = 0; i < BlockCol / THREAD_X; i++)
+                {
+                    tmp_c[j][i] += (s_idx[get4(pivot_base, k)][j * THREAD_Y + ty] * s_src[k][i * THREAD_X + tx]) % gfprime;
+                }
+            }
+        }
+
+        for (int j = 0; j < BlockRow / THREAD_Y; j++)
+        {
+            for (int i = 0; i < BlockCol / THREAD_X; i++)
+            {
+                if (by * BlockRow + j * THREAD_Y + ty < nrows && (by * BlockRow + j * THREAD_Y + ty < st_src || by * BlockRow + j * THREAD_Y + ty >= st_src + rank) && bx * BlockCol + i * THREAD_X + tx < ncols)
+                {
+                    gfp_t *dst = at_base(data, rs, by * BlockRow + j * THREAD_Y + ty, bx * BlockCol + i * THREAD_X + tx);
+                    *dst = (*dst + tmp_c[j][i]) % gfprime;
+                }
+            }
+        }
+    }
+
+    __host__ ElimResult MatGFP::gpu_elim()
+    {
+        init_inv_table();
+
+        MatGFP submat(nrows, gfp_block_size);
+
+        size_t rank = 0;
+        vector<size_t> p_col, p_row;
+
+        progress::ProgressBar pb("GPU ELIMINATE", (width - 1) / gfp_block_size + 1);
+        for (size_t w = 0; w < (width - 1) / gfp_block_size + 1; w++, pb.tick_display())
+        {
+            size_t st_w = w * gfp_block_size;
+            size_t ed_w = min(st_w + gfp_block_size, width);
+
+            submat.deepcopy(*this, rank, st_w, nrows, ed_w);
+
+            size_t src_rank = cpu_elim_base(submat, nrows - rank, rank, st_w, p_col, p_row);
+
+            if (src_rank == 0)
+            {
+                continue;
+            }
+            gfp_t pivot_base = gfp_zero;
+            for (size_t r = rank; r < rank + src_rank; r++)
+            {
+                cpu_swap_row(r, p_row[r]);
+                for (size_t i = rank; i < r; i++)
+                {
+                    cpu_addmul_row(r, i, gfprime - *at_base(r, p_col[i]), p_col[i]);
+                }
+                cpu_mul_row(r, gfp_inv(*at_base(r, p_col[r])), p_col[r]);
+                for (size_t i = rank; i < r; i++)
+                {
+                    cpu_addmul_row(i, r, gfprime - *at_base(i, p_col[r]), p_col[r]);
+                }
+                set4(pivot_base, p_col[r] - st_w, r - rank);
+            }
+
+            submat.deepcopy(*this, 0, st_w, nrows, ed_w);
+
+            dim3 block(THREAD_X, THREAD_Y);
+            dim3 grid((width - st_w - 1) / block.x + 1, (nrows - 1) / block.y + 1);
+            gpu_elim_kernel<<<grid, block>>>(submat.data, submat.rowstride, at_base(0, st_w), rowstride, rank, src_rank, pivot_base, nrows, width - st_w, gfp_block_size);
+            cudaDeviceSynchronize();
+
+            rank += src_rank;
+
+            if (rank == nrows)
+            {
+                break;
+            }
+        }
+        return {rank, p_col, p_row};
+    }
+}
+
+#endif
\ No newline at end of file
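
Note on the scheme above: each pass of gpu_elim copies one gfp_block_size-wide slab of columns into submat, cpu_elim_base reduces that slab on the host, and the pivot columns it finds are packed four bits apiece into the single word pivot_base via set4; gpu_elim_kernel then unpacks them with get4 (both helpers are added in gfp_header.cuh below) while it updates the remaining columns. A minimal standalone sketch of just that packing, with uint32_t standing in for gfp_t and the two helpers re-declared locally so the snippet compiles on its own:

// Standalone illustration of the 4-bit pivot packing used by gpu_elim and
// gpu_elim_kernel; uint32_t stands in for gfp_t here.
#include <cassert>
#include <cstdint>

static inline uint32_t offset4(uint32_t idx) { return idx << 2; }                                   // 4 bits per slot
static inline void set4(uint32_t &dst, uint32_t src, uint32_t idx) { dst |= src << offset4(idx); }  // slot must be 0
static inline uint32_t get4(uint32_t src, uint32_t idx) { return (src >> offset4(idx)) & 0xF; }

int main()
{
    // Suppose cpu_elim_base found pivots in columns 0, 2, 3 and 7 of the slab.
    uint32_t pivot_cols[4] = {0, 2, 3, 7};
    uint32_t pivot_base = 0;
    for (uint32_t r = 0; r < 4; r++)
    {
        set4(pivot_base, pivot_cols[r], r); // slot r holds the column of the r-th pivot
    }
    for (uint32_t r = 0; r < 4; r++)
    {
        assert(get4(pivot_base, r) == pivot_cols[r]); // what the kernel reads back
    }
    return 0;
}

Packing at most gfp_block_size = 8 small indices into one scalar keeps the kernel argument list free of an extra device-side array for the pivot positions.
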
diff --git a/include/gfp/gfp_header.cuh b/include/gfp/gfp_header.cuh
index 830479a..3470866 100644
--- a/include/gfp/gfp_header.cuh
+++ b/include/gfp/gfp_header.cuh
@@ -20,8 +20,31 @@ namespace gfp
         gfp_inv_table[0] = 0;
         gfp_inv_table[1] = 1;
         for (int i = 2; i < gfprime; ++i)
+        {
            gfp_inv_table[i] = (gfprime - gfprime / i) * gfp_inv_table[gfprime % i] % gfprime;
+        }
     }
 
+    gfp_t gfp_inv(gfp_t a)
+    {
+        return gfp_inv_table[a];
+    }
+
+    __host__ __device__ inline size_t offset4(size_t idx)
+    {
+        return idx << 2;
+    }
+
+    __host__ __device__ inline size_t get4(gfp_t src, size_t idx)
+    {
+        return (src >> offset4(idx)) & 0xF;
+    }
+
+    // the caller must make sure the 4-bit slot that set4 writes to is still 0
+    __host__ __device__ inline void set4(gfp_t &dst, size_t src, size_t idx)
+    {
+        dst |= (gfp_t)src << offset4(idx);
+    }
+
 }
 
 #endif
\ No newline at end of file
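
The inverse table filled by init_inv_table above relies on a standard identity for a prime modulus: writing gfprime = q * i + r with q = gfprime / i and r = gfprime % i gives q * i ≡ -r (mod gfprime), hence inv(i) ≡ -q * inv(r) ≡ (gfprime - q) * inv(r), and r < i guarantees inv(r) has already been computed. A small self-contained check of that recurrence; the prime 251 is only an example, not necessarily the repository's gfprime:

// Verify the inverse-table recurrence from init_inv_table for a small prime.
#include <cassert>
#include <cstdint>

int main()
{
    const uint32_t p = 251; // example prime only
    uint32_t inv[251];
    inv[1] = 1;
    for (uint32_t i = 2; i < p; i++)
    {
        inv[i] = (p - p / i) * inv[p % i] % p; // inv(i) = -(p / i) * inv(p % i) mod p
    }
    for (uint32_t i = 1; i < p; i++)
    {
        assert(i * inv[i] % p == 1); // every table entry really is an inverse
    }
    return 0;
}
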
diff --git a/include/gfp/gfp_mat.cuh b/include/gfp/gfp_mat.cuh
index 69c7fff..8252d03 100644
--- a/include/gfp/gfp_mat.cuh
+++ b/include/gfp/gfp_mat.cuh
@@ -9,6 +9,13 @@
 
 namespace gfp
 {
+    struct ElimResult
+    {
+        size_t rank;
+        vector<size_t> pivot;
+        vector<size_t> swap_row;
+    };
+
     class MatGFP
     {
     public:
@@ -76,6 +83,8 @@
         {
             if (type == root)
             {
+                // cout << nrows << " " << ncols << endl;
+                // cout << *data << endl;
                 CUDA_CHECK(cudaFree(data));
             }
         }
@@ -99,6 +108,12 @@
             }
         }
 
+        void deepcopy(const MatGFP &src, size_t begin_ri, size_t begin_wi, size_t end_rj, size_t end_wj)
+        {
+            assert(end_rj - begin_ri <= nrows && end_wj - begin_wi <= width);
+            CUDA_CHECK(cudaMemcpy2D(data, rowstride * sizeof(gfp_t), src.at_base(begin_ri, begin_wi), src.rowstride * sizeof(gfp_t), (end_wj - begin_wi) * sizeof(gfp_t), end_rj - begin_ri, cudaMemcpyDefault));
+        }
+
         // generate a random reduced row echelon matrix: pick nrows pivot columns from the first rank_col columns
         void randomize(size_t rank_col, uint_fast32_t seed)
         {
@@ -209,8 +224,40 @@
             return temp;
         }
 
-        // void cpu_swap_row(size_t r1, size_t r2);
-        // ElimResult gpu_elim();
+        void cpu_swap_row(size_t r1, size_t r2)
+        {
+            if (r1 == r2)
+            {
+                return;
+            }
+            gfp_t *p1 = at_base(r1, 0);
+            gfp_t *p2 = at_base(r2, 0);
+            for (size_t i = 0; i < width; i++)
+            {
+                gfp_t temp = p1[i];
+                p1[i] = p2[i];
+                p2[i] = temp;
+            }
+        }
+
+        void cpu_mul_row(size_t r, gfp_t mul, size_t begin_w = 0, size_t end_w = 0)
+        {
+            for (size_t i = begin_w; i < (end_w ? end_w : width); i++)
+            {
+                *at_base(r, i) = (*at_base(r, i) * mul) % gfprime;
+            }
+        }
+
+        void cpu_addmul_row(size_t r1, size_t r2, gfp_t mul, size_t begin_w = 0, size_t end_w = 0)
+        {
+            for (size_t i = begin_w; i < (end_w ? end_w : width); i++)
+            {
+                *at_base(r1, i) = (*at_base(r1, i) + *at_base(r2, i) * mul) % gfprime;
+            }
+        }
+
+        ElimResult gpu_elim();
+        ElimResult cpu_elim();
 
         friend ostream &operator<<(ostream &out, const MatGFP &m);
 
diff --git a/src/main.cu b/src/main.cu
index dfacffe..e67ed34 100644
--- a/src/main.cu
+++ b/src/main.cu
@@ -6,16 +6,19 @@
 
 using namespace gfp;
 
+bool test_gfp_elim(size_t rank, size_t rank_col, size_t nrows, size_t ncols, uint_fast32_t seed)
+{
+    MatGFP rdc(rank, ncols);
+    rdc.randomize(rank_col, seed);
+    MatGFP mix(nrows, rank);
+    mix.randomize(seed);
+    MatGFP a = mix * rdc;
+    ElimResult res = a.gpu_elim();
+    MatGFP win(a, 0, 0, res.rank, a.width);
+    return rdc == win;
+}
+
 int main()
 {
-    int m = 1000, k = 1000, n = 1000;
-    MatGFP a(m, k);
-    MatGFP b(k, n);
-    MatGFP c(m, n);
-    a.randomize(10);
-    b.randomize(10);
-    c.cpu_addmul(a, b);
-    MatGFP d(m, n);
-    d.gpu_mul(a, b);
-    cout << (c == d) << endl;
+    cout << test_gfp_elim(1234, 2345, 3456, 4567, 41921095) << endl;
 }
\ No newline at end of file
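
test_gfp_elim above validates gpu_elim by construction: rdc is a random matrix that is already in reduced row echelon form, mix * rdc mixes its rows, and because the reduced row echelon form of a matrix is unique, eliminating mix * rdc should reproduce rdc in the first res.rank rows (the window win), provided the random mixing matrix has full column rank, which is overwhelmingly likely over a prime field. For completeness, a hypothetical calling sketch for the new entry point; the umbrella include and the matrix sizes are illustrative assumptions, while the constructor, randomize, gpu_elim and the ElimResult fields are the ones used in this patch:

// Hypothetical usage sketch of the elimination API added by this patch.
#include <iostream>
#include "cuelim.cuh" // assumed umbrella header, as extended at the top of this patch

using namespace gfp;

int main()
{
    MatGFP a(512, 1024);           // example dimensions only
    a.randomize(12345);            // fill with random entries, as in the old main()
    ElimResult res = a.gpu_elim(); // reduce a in place on the GPU
    // res.rank  : number of pivots found
    // res.pivot : pivot column recorded for each pivot row
    std::cout << res.rank << std::endl;
    return 0;
}
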