#ifndef GF2_MUL_CUH
#define GF2_MUL_CUH

#include "gf2_mat.cuh"

namespace gf2 {

// Accumulates the contribution of one base-word column of `a` into `c`:
// for output row r and word column w, XOR together the precomputed table rows
// selected by the gf2_table_len-bit slices of a's word at (r, 0).
//
// Grid layout: x covers word columns [0, width), y covers rows [0, nrows).
// `ncols` is the number of valid bits in this word column of `a` (the last
// column of the matrix may be partial); slices past `ncols` are skipped and
// the final partial slice is masked, so neither the padding bits of `a` nor
// table rows that were not rebuilt for this column are ever consumed.
__global__ void gpu_addmul_kernel(base_t *a, size_t a_rowstride, base_t *tb,
                                  size_t tb_rowstride, base_t *c,
                                  size_t c_rowstride, size_t ncols,
                                  size_t width, size_t nrows) {
    size_t w = blockIdx.x * blockDim.x + threadIdx.x;
    size_t r = blockIdx.y * blockDim.y + threadIdx.y;
    if (w >= width || r >= nrows) {
        return;
    }
    // One word of `a` holds up to gf2_table_num slices of gf2_table_len bits;
    // each slice indexes one row of its own 2^gf2_table_len-row table.
    base_t val = *at_base(a, a_rowstride, r, 0);
    base_t acc = base_zero;
    size_t remaining = ncols;
    for (size_t i = 0; remaining > 0; i++) {
        size_t slice = remaining < gf2_table_len ? remaining : gf2_table_len;
        // Mask off padding bits in the final, partial slice (no-op when the
        // slice is full: the mask then equals gf2_table_mask).
        base_t sel =
            (val & gf2_table_mask) & (((base_t)1 << slice) - (base_t)1);
        acc ^= *at_base(tb, tb_rowstride, i * (1 << gf2_table_len) + sel, w);
        val >>= gf2_table_len;
        remaining -= slice;
    }
    *at_base(c, c_rowstride, r, w) ^= acc;
}

// Builds the lookup tables for one word column of `b`: table row r holds the
// XOR of the rows of `b` selected by the low gf2_table_len bits of r, taken
// from the gf2_table_len-row band covered by table (r >> gf2_table_len).
//
// Grid layout: x covers word columns [0, tb_width), y covers table rows
// [0, tb_nrows). `tb_nrows` bounds r explicitly: the grid's y extent is a
// round-up to the block size, and unguarded tail threads would write past the
// intended table rows and read rows of `b` beyond the current band.
__global__ void gpu_mktb_kernel(base_t *tb, size_t tb_rowstride, base_t *b,
                                size_t b_rowstride, size_t tb_width,
                                size_t tb_nrows) {
    size_t w = blockIdx.x * blockDim.x + threadIdx.x;
    size_t r = blockIdx.y * blockDim.y + threadIdx.y;
    if (w >= tb_width || r >= tb_nrows) {
        return;
    }
    base_t val = base_zero;
    base_t idx = r & gf2_table_mask;  // which combination of band rows to XOR
    base_t st_row = (r >> gf2_table_len) * gf2_table_len;  // band's first row
    for (size_t i = 0; i < gf2_table_len; i++) {
        if (get(idx, i) != 0) {
            val ^= *at_base(b, b_rowstride, st_row + i, w);
        }
    }
    *at_base(tb, tb_rowstride, r, w) = val;
}

// this += a * b over GF(2), using table lookup ("Method of the Four
// Russians"): for each base-word column of `a`, build XOR lookup tables from
// the matching band of rows of `b`, then combine table rows per output row.
// Preconditions (asserted): a.ncols == b.nrows, and this matrix is
// a.nrows x b.ncols.
__host__ void MatGF2::gpu_addmul(const MatGF2 &a, const MatGF2 &b) {
    assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols);
    // Scratch tables: gf2_table_num tables of 2^gf2_table_len rows each, one
    // word per word column of b. Allocated once, rebuilt for every column.
    MatGF2 tb(gf2_table_num * (1 << gf2_table_len), b.ncols);
    progress::ProgressBar pb("GPU MULTIPLY", a.width);
    for (size_t w = 0; w < a.width; w++, pb.tick_display()) {
        // Valid bit count of this word column of `a` (last one may be short).
        size_t size = min(base_len, a.ncols - w * base_len);
        // One full table per complete gf2_table_len-bit slice, plus a partial
        // table of 2^(size % gf2_table_len) rows for any remainder bits.
        size_t tb_nrows =
            (size / gf2_table_len) * (1 << gf2_table_len) +
            (size % gf2_table_len == 0 ? 0 : 1 << (size % gf2_table_len));

        dim3 block_tb(THREAD_X, THREAD_Y);
        dim3 grid_tb((b.width - 1) / block_tb.x + 1,
                     (tb_nrows - 1) / block_tb.y + 1);
        gpu_mktb_kernel<<<grid_tb, block_tb>>>(
            tb.data, tb.rowstride, b.at_base(w * base_len, 0), b.rowstride,
            tb.width, tb_nrows);
        // No sync needed here: both kernels run on the default stream, so the
        // table build is ordered before the multiply.

        dim3 block(THREAD_X, THREAD_Y);
        dim3 grid((b.width - 1) / block.x + 1, (nrows - 1) / block.y + 1);
        gpu_addmul_kernel<<<grid, block>>>(a.at_base(0, w), a.rowstride,
                                           tb.data, tb.rowstride, data,
                                           rowstride, size, width, nrows);
        cudaDeviceSynchronize();  // keeps the per-column progress bar honest
    }
}

// Returns a * b over GF(2). Asserts a.ncols == b.nrows.
__host__ MatGF2 gpu_mul(const MatGF2 &a, const MatGF2 &b) {
    assert(a.ncols == b.nrows);
    // NOTE(review): assumes the MatGF2 constructor zero-initializes the
    // matrix, since gpu_addmul accumulates into it — confirm in gf2_mat.cuh.
    MatGF2 c(a.nrows, b.ncols);
    c.gpu_addmul(a, b);
    return c;
}

}  // namespace gf2

#endif  // GF2_MUL_CUH