#ifndef MULTIPLICATION_CUH
#define MULTIPLICATION_CUH

#include <cassert>
#include "matrix.cuh"
#include "gf28.cuh"

// Process all rows of 32 base columns
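// One accumulation step of c ^= a * b over GF(2^8):
//   - `a` points at a single packed base-word column of the left operand
//     (each base_t row entry is read as base_deg GF(2^8) coefficients via get8),
//   - `b` points at the matching base_deg-row strip of the right operand,
//   - `c` is the accumulator the products are XORed into.
// Inferred, not stated in this header: THREAD_Y is expected to equal base_deg,
// and d_mul_table must already hold the GF(2^8) multiplication table.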
__global__ void gpu_addmul_kernel(base_t *a, size_t a_pitch, base_t *b, size_t b_pitch, base_t *c, size_t c_pitch, size_t nrows, size_t width)
{
    __shared__ __align__(8) base_t src[base_deg][THREAD_X];
    size_t r = threadIdx.y;
    size_t w = blockIdx.x * blockDim.x + threadIdx.x;
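
    // Stage this block's base_deg x THREAD_X tile of b in shared memory;
    // threads that fall outside the matrix store base_zero as padding.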
    if (r < nrows && w < width)
        src[threadIdx.y][threadIdx.x] = *at_pitch(b, b_pitch, r, w);
    else
        src[threadIdx.y][threadIdx.x] = base_zero;

    __syncthreads();

    // Out-of-range columns drop out only after the barrier: every thread of
    // the block has to reach __syncthreads().
    if (w >= width)
    {
        return;
    }
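
    // Walk down this output column with stride base_deg: each packed word of
    // `a` supplies base_deg GF(2^8) coefficients, which mul_base() multiplies
    // against the staged rows of `b`; the products are XOR-accumulated into c.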
    for (; r < nrows; r += base_deg)
    {
        base_t val = *at_pitch(a, a_pitch, r, 0);
        base_t temp = base_zero;
        for (size_t i = 0; i < base_deg; i++)
        {
            temp ^= mul_base(get8(val, i), src[i][threadIdx.x]);
        }
        *at_pitch(c, c_pitch, r, w) ^= temp;
    }
}
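
// Host-side driver: uploads the GF(2^8) multiplication table to the device
// symbol d_mul_table, then accumulates a * b into this matrix one packed
// base-word column of `a` (with the matching row strip of `b`) at a time.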
__host__ void GF28Matrix::gpu_addmul(const GF28Matrix &a, const GF28Matrix &b, const GF28 &gf)
{
    assert(a.ncols == b.nrows && a.nrows == nrows && b.ncols == ncols);
    cudaMemcpyToSymbol(d_mul_table, gf.mul_table, (1 << base_deg) * (1 << base_deg) * sizeof(gf28_t));
    for (size_t w = 0; w < a.width; w++)
    {
        dim3 block(THREAD_X, THREAD_Y);
        dim3 grid((b.width - 1) / block.x + 1);
        cudaDeviceSynchronize();
        gpu_addmul_kernel<<<grid, block>>>(a.at_base(0, w), a.pitch, b.at_base(w * base_num, 0), b.pitch, data, pitch, nrows, width);
    }
    cudaDeviceSynchronize();
}
#endif