237 lines
8.0 KiB
Plaintext
237 lines
8.0 KiB
Plaintext
|
#ifndef GFP_ELIM_CUH
|
||
|
#define GFP_ELIM_CUH
|
||
|
|
||
|
#include <cstdio>

#include "gfp_mat.cuh"
|
||
|
|
||
|
// Number of columns processed per panel in the blocked elimination below.
// NOTE(review): set4/get4 appear to pack one pivot-column offset per 4 bits
// of a gfp_t `pivot_base`, which would cap this at 8 -- confirm before raising.
static const size_t gfp_block_size = 8;
|
||
|
|
||
|
namespace gfp
|
||
|
{
|
||
|
// Reduce this matrix to reduced row-echelon form over GF(p) on the CPU.
//
// Returns ElimResult {rank, pivot, swap_row}, where pivot[i] is the column of
// the i-th pivot and swap_row[i] is the row swapped up into pivot position i
// (swap_row[i] == i when the pivot was already in place). pivot and swap_row
// are kept as parallel arrays of length `rank`, matching the p_col/p_row
// bookkeeping produced by gpu_elim().
ElimResult MatGFP::cpu_elim()
{
    init_inv_table();

    size_t rank = 0;
    vector<size_t> pivot;
    vector<size_t> swap_row;

    for (size_t w = 0; w < width; w++)
    {
        if (*at_base(rank, w) != gfp_zero)
        {
            // Pivot already in place; record the identity "swap".
            swap_row.push_back(rank);
        }
        else
        {
            // Search below the current row for a nonzero entry to swap up.
            // BUG FIX: the original code executed `continue` even after a
            // successful swap, so the column that had just received a nonzero
            // pivot was never normalized or eliminated, and pivot/swap_row
            // fell out of sync. Only skip the column when no candidate exists.
            bool found = false;
            for (size_t r = rank + 1; r < nrows; r++)
            {
                if (*at_base(r, w) != gfp_zero)
                {
                    swap_row.push_back(r);
                    cpu_swap_row(rank, r);
                    found = true;
                    break;
                }
            }
            if (!found)
            {
                // Column w contributes no pivot; do not consume a row.
                continue;
            }
        }
        pivot.push_back(w);

        // Normalize the pivot row so the pivot entry becomes 1 (mod p).
        gfp_t inv = gfp_inv(*at_base(rank, w));
        for (size_t wi = w; wi < width; wi++)
        {
            *at_base(rank, wi) = (*at_base(rank, wi) * inv) % gfprime;
        }

        // Eliminate column w from every other row (full reduced form,
        // not just the rows below the pivot).
        for (size_t r = 0; r < nrows; r++)
        {
            if (r == rank)
            {
                continue;
            }

            gfp_t mul = *at_base(r, w);

            // row_r += row_rank * (p - mul)  ==  row_r -= row_rank * mul (mod p)
            for (size_t wi = w; wi < width; wi++)
            {
                *at_base(r, wi) = (*at_base(r, wi) + (*at_base(rank, wi) * (gfprime - mul))) % gfprime;
            }
        }

        rank++;
        if (rank == nrows)
        {
            break;
        }
    }
    return {rank, pivot, swap_row};
}
|
||
|
|
||
|
// Runs Gaussian elimination on a (submat_nrows x gfp_block_size) panel to
// locate up to gfp_block_size pivots, recording their global positions.
//
// submat       : working copy of the panel; partially reduced in place here
//                (the caller discards/overwrites it afterwards)
// submat_nrows : number of candidate rows in submat
// st_r         : global row index of submat row 0 (offset added into p_row)
// w            : global column index of submat column 0 (offset added into p_col)
// p_col, p_row : appended with the global column/row of each pivot found;
//                kept as parallel arrays
// returns the number of pivots found (the panel's rank)
//
// Elimination against earlier pivots is applied lazily: next[i] is the first
// row index not yet reduced against pivot i, so each candidate row is updated
// exactly once per pivot as the row scan passes over it.
size_t cpu_elim_base(MatGFP &submat, base_t submat_nrows, size_t st_r, size_t w, vector<size_t> &p_col, vector<size_t> &p_row)
{
    size_t rank = 0;
    // pivot[i] = panel column of pivot i; next[i] = lazy-elimination cursor.
    // Both are uninitialized beyond index rank-1; rank < gfp_block_size always
    // holds since at most one pivot is produced per pivot_col iteration.
    size_t pivot[gfp_block_size];
    size_t next[gfp_block_size];
    for (size_t pivot_col = 0; pivot_col < gfp_block_size; pivot_col++)
    {
        for (size_t r = rank; r < submat_nrows; r++)
        {
            // Catch row r up on every pivot it has not been reduced against.
            for (size_t i = 0; i < rank; i++)
            {
                if (next[i] == r)
                {
                    gfp_t mul = *submat.at_base(r, pivot[i]);
                    // Only columns right of the pivot matter from here on;
                    // the pivot column itself is left stale.
                    for (size_t j = pivot[i] + 1; j < gfp_block_size; j++)
                    {
                        *submat.at_base(r, j) = (*submat.at_base(r, j) + *submat.at_base(i, j) * (gfprime - mul)) % gfprime;
                    }
                    next[i]++;
                }
            }

            if (*submat.at_base(r, pivot_col) != 0)
            {
                // Found the pivot for this column: report its global position.
                p_col.push_back(w + pivot_col);
                p_row.push_back(st_r + r);
                if (r != rank)
                {
                    // Move the pivot row up. The displaced row was already
                    // reduced against all earlier pivots (their next[] cursors
                    // have passed r), so the lazy bookkeeping stays valid.
                    submat.cpu_swap_row(rank, r);
                }
                // Normalize the columns right of the pivot; the pivot entry
                // itself is not rewritten (only the positions matter to the
                // caller, which re-normalizes on the full matrix).
                gfp_t inv = gfp_inv(*submat.at_base(rank, pivot_col));
                for (size_t j = pivot_col + 1; j < gfp_block_size; j++)
                {
                    *submat.at_base(rank, j) = (inv * (*submat.at_base(rank, j))) % gfprime;
                }
                pivot[rank] = pivot_col;
                // Rows after the new pivot row still need reduction against it.
                next[rank] = rank + 1;
                rank++;
                break;
            }
        }
    }
    return rank;
}
|
||
|
|
||
|
// Applies one panel's elimination step to the rest of the matrix:
// for every non-pivot row r and column c in the panel's trailing region,
//   data[r][c] += sum_k idx[r][col_k] * (p - src[k][c])   (mod p)
// i.e. subtracts each row's multiple of the normalized pivot rows.
//
// Tile layout: blockIdx.x tiles BlockCol columns, blockIdx.y tiles BlockRow
// rows; each thread accumulates a (BlockRow/THREAD_Y) x (BlockCol/THREAD_X)
// register tile, so the launch must use blockDim == (THREAD_X, THREAD_Y).
//
// idx        : per-row multiplier panel (the submat copy); idx_rs its row stride
// data       : matrix region to update, already offset to the panel start;
//              rs its row stride
// st_src     : first pivot row; rows [st_src, st_src + rank) of data are the
//              normalized pivot rows used as the elimination source
// rank       : number of pivots produced for this panel
// pivot_base : packed pivot-column offsets; get4(pivot_base, k) is presumably
//              the 4-bit in-panel column offset of pivot k, matching set4()
//              in gpu_elim() -- TODO confirm
// nrows/ncols: bounds of the region being updated
// nsteps     : number of valid columns in the idx panel (<= StepSize)
__global__ void gpu_elim_kernel(gfp_t *__restrict__ idx, const size_t idx_rs, gfp_t *__restrict__ data, const size_t rs, const size_t st_src, const size_t rank, const gfp_t pivot_base, const size_t nrows, const size_t ncols, const size_t nsteps)
{

    const unsigned int bx = blockIdx.x;
    const unsigned int by = blockIdx.y;
    const unsigned int tx = threadIdx.x;
    const unsigned int ty = threadIdx.y;
    const unsigned int tid = ty * blockDim.x + tx;

    // s_idx is stored transposed ([panel col][row]) so the inner product loop
    // reads a full row of it per k; s_src caches the pivot rows, pre-negated.
    __shared__ gfp_t s_idx[StepSize][BlockRow];
    __shared__ gfp_t s_src[StepSize][BlockCol];

    gfp_t tmp_c[BlockRow / THREAD_Y][BlockCol / THREAD_X] = {0};
    // Cooperative load of both shared tiles, out-of-range entries zeroed.
    // NOTE(review): the trip count is StepSize * BlockRow but the s_src
    // indexing divides by BlockCol, so s_src is only fully covered (and not
    // overrun) when BlockRow == BlockCol -- confirm the two constants match.
    for (int k = tid; k < StepSize * BlockRow; k += blockDim.x * blockDim.y)
    {
        const int a_r = k / StepSize;
        const int a_c = k % StepSize;
        s_idx[a_c][a_r] = (by * BlockRow + a_r < nrows && a_c < nsteps) ? *at_base(idx, idx_rs, by * BlockRow + a_r, a_c) : 0;
        const int b_r = k / BlockCol;
        const int b_c = k % BlockCol;
        s_src[b_r][b_c] = (b_r < rank && bx * BlockCol + b_c < ncols) ? *at_base(data, rs, st_src + b_r, bx * BlockCol + b_c) : 0;
        // Negate mod p so the accumulation below is a subtraction.
        s_src[b_r][b_c] = s_src[b_r][b_c] ? gfprime - s_src[b_r][b_c] : 0;
    }
    __syncthreads();
    // (A commented-out per-block debug dump of s_idx/s_src was removed here.)
    // Register-tiled accumulation over the panel's pivots.
    // NOTE(review): tmp_c accumulates up to `rank` mod-p-reduced products
    // without intermediate reduction; this requires gfp_t to hold
    // rank * (gfprime - 1), and the product before % to fit gfp_t -- confirm.
    for (int k = 0; k < rank; k++)
    {
        for (int j = 0; j < BlockRow / THREAD_Y; j++)
        {
            for (int i = 0; i < BlockCol / THREAD_X; i++)
            {
                tmp_c[j][i] += (s_idx[get4(pivot_base, k)][j * THREAD_Y + ty] * s_src[k][i * THREAD_X + tx]) % gfprime;
            }
        }
    }

    // Write back, skipping the pivot rows [st_src, st_src + rank) themselves
    // and any out-of-range tile cells.
    for (int j = 0; j < BlockRow / THREAD_Y; j++)
    {
        for (int i = 0; i < BlockCol / THREAD_X; i++)
        {
            if (by * BlockRow + j * THREAD_Y + ty < nrows && (by * BlockRow + j * THREAD_Y + ty < st_src || by * BlockRow + j * THREAD_Y + ty >= st_src + rank) && bx * BlockCol + i * THREAD_X + tx < ncols)
            {
                gfp_t *dst = at_base(data, rs, by * BlockRow + j * THREAD_Y + ty, bx * BlockCol + i * THREAD_X + tx);
                *dst = (*dst + tmp_c[j][i]) % gfprime;
            }
        }
    }
}
|
||
|
|
||
|
// Reduce this matrix to reduced row-echelon form over GF(p), using the CPU to
// find pivots within each gfp_block_size-column panel and the GPU to apply
// the bulk elimination update to the trailing matrix.
//
// Returns ElimResult {rank, p_col, p_row}: p_col[i]/p_row[i] are the global
// column and original row of pivot i (parallel arrays of length rank).
__host__ ElimResult MatGFP::gpu_elim()
{
    init_inv_table();

    // Scratch panel reused every iteration to avoid per-iteration allocation.
    MatGFP submat(nrows, gfp_block_size);

    size_t rank = 0;
    vector<size_t> p_col, p_row;

    progress::ProgressBar pb("GPU ELIMINATE", (width - 1) / gfp_block_size + 1);
    for (size_t w = 0; w < (width - 1) / gfp_block_size + 1; w++, pb.tick_display())
    {
        size_t st_w = w * gfp_block_size;
        size_t ed_w = min(st_w + gfp_block_size, width);

        // Copy the not-yet-eliminated rows of this panel and locate its pivots.
        submat.deepcopy(*this, rank, st_w, nrows, ed_w);

        size_t src_rank = cpu_elim_base(submat, nrows - rank, rank, st_w, p_col, p_row);

        if (src_rank == 0)
        {
            // No pivot in this panel; nothing to eliminate.
            continue;
        }
        // Bring the pivot rows to positions rank..rank+src_rank-1 on the full
        // matrix and reduce them against each other (small RREF of the panel's
        // pivot rows), packing each pivot's in-panel column offset into
        // pivot_base (4 bits per pivot) for the kernel.
        gfp_t pivot_base = gfp_zero;
        for (size_t r = rank; r < rank + src_rank; r++)
        {
            cpu_swap_row(r, p_row[r]);
            for (size_t i = rank; i < r; i++)
            {
                cpu_addmul_row(r, i, gfprime - *at_base(r, p_col[i]), p_col[i]);
            }
            cpu_mul_row(r, gfp_inv(*at_base(r, p_col[r])), p_col[r]);
            for (size_t i = rank; i < r; i++)
            {
                cpu_addmul_row(i, r, gfprime - *at_base(i, p_col[r]), p_col[r]);
            }
            set4(pivot_base, p_col[r] - st_w, r - rank);
        }

        // Re-copy the panel (all rows) now that pivot rows are normalized:
        // it supplies the per-row elimination multipliers to the kernel.
        submat.deepcopy(*this, 0, st_w, nrows, ed_w);

        dim3 block(THREAD_X, THREAD_Y);
        // NOTE(review): each kernel block covers BlockCol x BlockRow elements,
        // so dividing by block.x/block.y over-provisions the grid whenever
        // BlockCol > THREAD_X (extra blocks are bounds-checked away, but
        // wasted) -- confirm whether ceil-div by BlockCol/BlockRow was meant.
        dim3 grid((width - st_w - 1) / block.x + 1, (nrows - 1) / block.y + 1);
        gpu_elim_kernel<<<grid, block>>>(submat.data, submat.rowstride, at_base(0, st_w), rowstride, rank, src_rank, pivot_base, nrows, width - st_w, gfp_block_size);
        // Launch-configuration errors surface via cudaGetLastError(); in-kernel
        // faults surface at the synchronizing call. The original code checked
        // neither, so failures became silently wrong results.
        cudaError_t launch_err = cudaGetLastError();
        if (launch_err != cudaSuccess)
        {
            fprintf(stderr, "gpu_elim: kernel launch failed: %s\n", cudaGetErrorString(launch_err));
        }
        cudaError_t sync_err = cudaDeviceSynchronize();
        if (sync_err != cudaSuccess)
        {
            fprintf(stderr, "gpu_elim: kernel execution failed: %s\n", cudaGetErrorString(sync_err));
        }

        rank += src_rank;

        if (rank == nrows)
        {
            break;
        }
    }
    return {rank, p_col, p_row};
}
|
||
|
}
|
||
|
|
||
|
#endif
|