#ifndef GF2_ELIM_CUH
#define GF2_ELIM_CUH

#include "gf2_mat.cuh"

namespace gf2
{
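// Host-side Gaussian elimination over GF(2) on one machine-word-wide column
// slab. base_col holds word w of each row; pivots found are appended to
// p_col / p_row as global column / row indices (st_r is the slab's first
// global row). Reduction is lazy: next[i] is the first row not yet reduced
// by pivot i, and the reduction appears to clear only the bits past the
// pivot (concat(base_zero, pivot[i] + 1, base_col[i]), per gf2_mat.cuh), so
// each pivot bit survives in base_col as a record of which pivot rows touch
// a given row; the kernels below replay the elimination from that record.
// Returns the rank contributed by this slab.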
size_t cpu_elim_base(base_t *base_col, size_t base_col_len, size_t st_r, size_t w, vector<size_t> &p_col, vector<size_t> &p_row)
{
    size_t rank = 0;
    size_t pivot[gf2_num]; // bit position of each pivot found so far
    size_t next[gf2_num];  // first row not yet reduced by that pivot
    for (size_t pivot_col = 0; pivot_col < gf2_num; pivot_col++)
    {
        for (size_t r = rank; r < base_col_len; r++)
        {
            // Lazily apply all earlier pivots to row r before testing it.
            for (size_t i = 0; i < rank; i++)
            {
                if (next[i] == r)
                {
                    if (get(base_col[r], pivot[i]) != 0)
                    {
                        base_col[r] ^= concat(base_zero, pivot[i] + 1, base_col[i]);
                    }
                    next[i]++;
                }
            }

            if (get(base_col[r], pivot_col) != 0)
            {
                // Found a pivot: record it and move its row up to position rank.
                p_col.push_back(w * gf2_num + pivot_col);
                p_row.push_back(st_r + r);
                if (r != rank)
                {
                    base_t temp = base_col[rank];
                    base_col[rank] = base_col[r];
                    base_col[r] = temp;
                }
                pivot[rank] = pivot_col;
                next[rank] = rank + 1;
                rank++;
                break;
            }
        }
    }
    return rank;
}

// Bit positions (within the current word) of this round's pivots; managed
// so the host fills it and the kernels read it.
__managed__ uint32_t m_pivot[gf2_num];

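// One thread per word of columns: brings the src_rank pivot rows (already
// swapped to the top of the uneliminated block) into reduced row echelon
// form. base_col carries the elimination record left by cpu_elim_base; a
// forward pass cancels earlier pivot rows out of later ones, then a
// backward pass cancels later pivot rows out of earlier ones.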
__global__ void gpu_mksrc_kernel(base_t *src, size_t s_rowstride, base_t *base_col, size_t rank, uint32_t m_pivot[gf2_num], size_t width)
{
    size_t w = blockIdx.x * blockDim.x + threadIdx.x;
    if (w >= width)
    {
        return;
    }
    // Load this word of every pivot row into registers.
    base_t temp[gf2_num];
    for (size_t r = 0; r < rank; r++)
    {
        temp[r] = *at_base(src, s_rowstride, r, w);
    }
    // Forward pass: cancel earlier pivot rows out of later ones.
    for (size_t r = 0; r < rank; r++)
    {
        for (size_t i = 0; i < r; i++)
        {
            if (get(base_col[r], m_pivot[i]) != 0)
            {
                temp[r] ^= temp[i];
            }
        }
    }
    // Backward pass: cancel later pivot rows out of earlier ones,
    // walking r from rank - 2 down to 0.
    for (size_t rr = 1; rr < rank; rr++)
    {
        size_t r = rank - 1 - rr;
        for (size_t i = r + 1; i < rank; i++)
        {
            if (get(base_col[r], m_pivot[i]) != 0)
            {
                temp[r] ^= temp[i];
            }
        }
    }
    for (size_t r = 0; r < rank; r++)
    {
        *at_base(src, s_rowstride, r, w) = temp[r];
    }
}

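// Builds lookup tables in the style of the method of the four Russians:
// the pivot rows are grouped gf2_table_len at a time, and each table row
// stores the XOR of the subset of its group selected by the row's low
// gf2_table_len index bits, so the elimination kernel can apply
// gf2_table_len pivot rows with a single lookup per table.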
__global__ void gpu_elim_mktb_kernel(base_t *tb, size_t tb_rowstride, base_t *b, size_t b_rowstride, size_t tb_nrows, size_t tb_width)
{
    size_t w = blockIdx.x * blockDim.x + threadIdx.x;
    size_t r = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads outside the table area (the launch grid rounds tb_nrows up to
    // the block size) do nothing.
    if (w >= tb_width || r >= tb_nrows)
    {
        return;
    }

    base_t val = base_zero;
    base_t idx = r & gf2_table_mask;                      // subset selector within this table
    base_t st_row = (r >> gf2_table_len) * gf2_table_len; // first pivot row this table covers

    // XOR together the pivot rows selected by the index bits.
    for (size_t i = 0; i < gf2_table_len; i++)
    {
        if (get(idx, i) != 0)
        {
            val ^= *at_base(b, b_rowstride, st_row + i, w);
        }
    }
    *at_base(tb, tb_rowstride, r, w) = val;
}

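// Clears the slab's pivot columns from every remaining row: the row's bits
// at the pivot positions (read from the idx snapshot of word w) select one
// precomputed entry per table, and XORing those entries cancels all rank
// pivots at once. Rows st_skip .. st_skip + rank - 1 are the pivot block
// itself and are skipped.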
__global__ void gpu_elim_kernel(base_t *idx, base_t *tb, size_t tb_rowstride, base_t *data, size_t rowstride, size_t rank, uint32_t m_pivot[gf2_num], size_t st_skip, size_t width, size_t nrows)
{
    size_t w = blockIdx.x * blockDim.x + threadIdx.x;
    size_t r = blockIdx.y * blockDim.y + threadIdx.y;

    if (w >= width || r >= nrows || (r >= st_skip && r < st_skip + rank))
    {
        return;
    }

    base_t val = idx[r];
    base_t temp = base_zero;
    // Only tables that cover a pivot of this round contribute.
    for (size_t i = 0; i < gf2_table_num && i * gf2_table_len < rank; i++)
    {
        // Gather this row's bits at the pivot positions into a table index.
        base_t loc = base_zero;
        for (size_t j = 0; (j < gf2_table_len) && (i * gf2_table_len + j < rank); j++)
        {
            loc |= get(val, m_pivot[i * gf2_table_len + j]) << j;
        }
        temp ^= *at_base(tb, tb_rowstride, i * (1 << gf2_table_len) + loc, w);
    }
    *at_base(data, rowstride, r, w) ^= temp;
}

// __managed__ base_t spL[gf2_num];

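// In-place elimination of the whole matrix, one machine word of columns per
// iteration: find pivots on the host (cpu_elim_base), normalize the pivot
// rows on the GPU (gpu_mksrc_kernel), build the lookup tables
// (gpu_elim_mktb_kernel), then cancel the slab's pivots from every other
// row (gpu_elim_kernel). Returns the rank and the pivot column/row lists.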
__host__ ElimResult MatGF2::gpu_elim()
{
    MatGF2 tb(gf2_table_num * (1 << gf2_table_len), ncols);

    base_t *base_col;
    CUDA_CHECK(cudaMallocManaged(&base_col, nrows * sizeof(base_t)));
    base_t *idx;
    CUDA_CHECK(cudaMallocManaged(&idx, nrows * sizeof(base_t)));

    size_t rank = 0;
    vector<size_t> p_col, p_row;

    progress::ProgressBar pb("GPU ELIMINATE", width);
    for (size_t w = 0; w < width; w++, pb.tick_display())
    {
        // Copy word w of every not-yet-eliminated row into base_col.
        CUDA_CHECK(cudaMemcpy2D(base_col + rank, sizeof(base_t), at_base(rank, w), rowstride * sizeof(base_t), sizeof(base_t), nrows - rank, cudaMemcpyDefault));

        size_t src_rank = cpu_elim_base(base_col + rank, nrows - rank, rank, w, p_col, p_row);

        if (src_rank == 0)
        {
            continue;
        }

        // Bring this round's pivot rows to the top of the uneliminated block.
        for (size_t i = 0; i < src_rank; i++)
        {
            cpu_swap_row(rank + i, p_row[rank + i]);
        }

        // Pivot bit positions within word w, for the kernels.
        for (size_t r = 0; r < src_rank; r++)
        {
            size_t loc = (p_col[rank + r] - w * gf2_num);
            m_pivot[r] = loc;
        }

        // Reduce the pivot rows over the remaining width - w words.
        dim3 block_src(THREAD_X);
        dim3 grid_src((width - w - 1) / block_src.x + 1);
        gpu_mksrc_kernel<<<grid_src, block_src>>>(at_base(rank, w), rowstride, base_col + rank, src_rank, m_pivot, width - w);
        cudaDeviceSynchronize();

        // Table rows: one full 2^gf2_table_len block per gf2_table_len pivots,
        // plus a partial block for the leftover pivots.
        size_t tb_nrows = (src_rank / gf2_table_len) * (1 << gf2_table_len) + (src_rank % gf2_table_len == 0 ? 0 : 1 << (src_rank % gf2_table_len));

        dim3 block_tb(THREAD_X, THREAD_Y);
        dim3 grid_tb((width - w - 1) / block_tb.x + 1, (tb_nrows - 1) / block_tb.y + 1);
        gpu_elim_mktb_kernel<<<grid_tb, block_tb>>>(tb.data, tb.rowstride, at_base(rank, w), rowstride, tb_nrows, tb.width);
        cudaDeviceSynchronize();

        // Snapshot word w of every row; the elimination kernel reads the
        // pivot bits from this copy while it rewrites the matrix.
        CUDA_CHECK(cudaMemcpy2D(idx, sizeof(base_t), at_base(0, w), rowstride * sizeof(base_t), sizeof(base_t), nrows, cudaMemcpyDefault));

        dim3 block(THREAD_X, THREAD_Y);
        dim3 grid((width - w - 1) / block.x + 1, (nrows - 1) / block.y + 1);
        gpu_elim_kernel<<<grid, block>>>(idx, tb.data, tb.rowstride, at_base(0, w), rowstride, src_rank, m_pivot, rank, width - w, nrows);
        cudaDeviceSynchronize();

        rank += src_rank;

        if (rank == nrows)
        {
            break;
        }
    }
    cudaFree(base_col);
    cudaFree(idx);
    return {rank, p_col, p_row};
}
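
// Usage sketch (illustrative; assumes MatGF2 and ElimResult from
// gf2_mat.cuh; how the matrix is filled is left out):
//
//   gf2::MatGF2 m(nrows, ncols);
//   // ... fill m ...
//   gf2::ElimResult res = m.gpu_elim(); // reduces m in place and returns
//                                       // the rank and pivot lists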
}

#endif