Skip to content

Vector pa kernels #4861

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 46 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
68ae66c
vecmass PA setup
camierjs May 9, 2025
93dbe14
PAVectorMassApply3D with Eval3d
camierjs May 9, 2025
3949dec
PAVectorMassApply2D with Eval2d
camierjs May 10, 2025
d2e2834
VectorMassIntegrator PA specializations
camierjs May 10, 2025
b08d12b
VectorMassIntegrator 2D with MatrixFunctionCoefficients
camierjs May 12, 2025
0892f19
VectorMassIntegrator 3D MatrixFunctionCoefficient
camierjs May 12, 2025
1c04593
VectorMassIntegrator cleanup
camierjs May 12, 2025
72d28e0
VectorDiffusionIntegrator AssemblePA vector coefficients
camierjs May 12, 2025
ba4befc
VectorDiffusionIntegrator wip Matrix coefficient
camierjs May 12, 2025
c707a4d
jj=ii
camierjs May 12, 2025
3086dae
WIP PAVectorDiffusionApply2D mcoeff
camierjs May 13, 2025
5827732
WIP 0 1
camierjs May 13, 2025
7f7e5f6
All c, d
camierjs May 13, 2025
031cd67
VectorDiffusionIntegrator 2D mcoeff
camierjs May 13, 2025
2a3c1c5
VectorDiffusionIntegrator 2D cleanup
camierjs May 13, 2025
3c2f98b
test_vector_pa_integrator all tests passed
camierjs May 13, 2025
3fb13a3
WIP 3D vector diffusion with sym mcoeff
camierjs May 13, 2025
d27fa84
Pre AssemblePA 2D vector diffusion
camierjs May 13, 2025
580dc6f
Simplify VectorDiffusionIntegrator::AssemblePA
camierjs May 13, 2025
6a04555
SmemPAVectorDiffusionApply2D
camierjs May 13, 2025
795a121
SmemPAVectorDiffusionApply3D
camierjs May 13, 2025
686feff
VectorDiffusionAddMultPA registered
camierjs May 13, 2025
51c28ad
Fix grad ii vs. jj matrix coeff
camierjs May 14, 2025
eb68434
Simplify regs_t kernels types on CPU
May 15, 2025
e41c782
Use explicit type names for registers
May 15, 2025
62dd1ae
Cleanup
May 16, 2025
ea25a9b
Remove PAVectorDiffusionApply kernels, fuse SDIM != DIM diffusion vec…
camierjs May 17, 2025
3b88332
Merge branch 'master' into vector-pa-kernels
camierjs May 17, 2025
6bb5f8a
Change pa data layout
camierjs May 18, 2025
190c4a0
WIP SmemPAVectorMassApply2D layouts
camierjs May 18, 2025
2ead488
Pre SmemPAVectorMassApply2D cleanup
camierjs May 18, 2025
82c2e01
Cleanup
camierjs May 18, 2025
7a37cc2
MSVC header guards
camierjs May 18, 2025
c50905e
MAX_T1D for WIN32
camierjs May 18, 2025
f3443f9
MSVC fix
camierjs May 18, 2025
b3e9ddd
WIN32 cmath
camierjs May 18, 2025
f9972a5
Simplify back SmemPAVectorMassApply2D
camierjs May 19, 2025
716c201
GPU vector kernels fix
camierjs May 19, 2025
0742475
test_pa_kernels tags
camierjs May 19, 2025
189c201
Fix shadowing
camierjs May 19, 2025
7dba460
Move kernel methods to fem/kernels.hpp
camierjs May 19, 2025
acbe258
CMake remove header
camierjs May 19, 2025
6145305
Merge branch 'master' into vector-pa-kernels
camierjs May 28, 2025
2e70688
Merge branch 'master' into vector-pa-kernels
camierjs Jun 13, 2025
ba70885
Merge branch 'master' into vector-pa-kernels
camierjs Jun 16, 2025
319b870
Use future namespace for tensor
camierjs Jun 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Use explicit type names for registers
  • Loading branch information
camierjs authored and camierjs committed May 15, 2025
commit e41c782974d5f7c60b7255311e3b1183bf641f41
4 changes: 2 additions & 2 deletions fem/integ/bilininteg_vecdiffusion_pa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ void VectorDiffusionIntegrator::AssemblePA(const FiniteElementSpace &fes)
dim = mesh->Dimension();
sdim = mesh->SpaceDimension();
const int nq = ir->GetNPoints();
const int nd = el.GetDof();
// const int nd = el.GetDof();
const int dims = el.GetDim();

dbg("dim:{} vdim:{} fes.VDim():{} sdim:{} nq:{} nd:{} dims:{}",
Expand Down Expand Up @@ -93,7 +93,7 @@ void VectorDiffusionIntegrator::AssemblePA(const FiniteElementSpace &fes)

const int pa_size = dims*dims;
coeff_vdim = coeff.GetVDim();
const bool const_coeff = coeff.Size() == 1;
// const bool const_coeff = coeff.Size() == 1;
assert(!const_coeff);
dbg("\x1b[33mpa_size:{}", pa_size);
dbg("\x1b[33mconst_coeff:{}", const_coeff);
Expand Down
4 changes: 2 additions & 2 deletions fem/integ/bilininteg_vecdiffusion_pa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ void SmemPAVectorDiffusionApply2D(const int NE,
constexpr int MQ1 = T_Q1D > 0 ? kernels::internal::SetMaxOf(T_Q1D) : 32;

MFEM_SHARED real_t sB[MD1][MQ1], sG[MD1][MQ1], smem[MQ1][MQ1];
kernels::internal::regs_t<VDIM, DIM, MQ1, MQ1> r0, r1;
kernels::internal::vd_regs2d_t<VDIM, DIM, MQ1> r0, r1;
kernels::internal::LoadMatrix(D1D, Q1D, b, sB);
kernels::internal::LoadMatrix(D1D, Q1D, g, sG);

Expand Down Expand Up @@ -275,7 +275,7 @@ void SmemPAVectorDiffusionApply3D(const int NE,
constexpr int MQ1 = T_Q1D > 0 ? kernels::internal::SetMaxOf(T_Q1D) : 32;

MFEM_SHARED real_t sB[MD1][MQ1], sG[MD1][MQ1], smem[MQ1][MQ1];
kernels::internal::regs_t<VDIM, DIM, MQ1, MQ1, MQ1> r0, r1;
kernels::internal::vd_regs3d_t<VDIM, DIM, MQ1> r0, r1;
kernels::internal::LoadMatrix(D1D, Q1D, b, sB);
kernels::internal::LoadMatrix(D1D, Q1D, g, sG);

Expand Down
4 changes: 2 additions & 2 deletions fem/integ/bilininteg_vecmass_pa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ void VectorMassIntegrator::AssemblePA(const FiniteElementSpace &fes)
Mesh *mesh = fes.GetMesh();
const FiniteElement &el = *fes.GetTypicalFE();
ElementTransformation &Trans = *mesh->GetTypicalElementTransformation();
const int spaceDim = Trans.GetSpaceDim();
// const int spaceDim = Trans.GetSpaceDim();

const int nd = el.GetDof();
// const int nd = el.GetDof();
dbg("nd: {} spaceDim: {}", nd, spaceDim);
dbg("vdim: {} spaceDim: {}", vdim, spaceDim);

Expand Down
8 changes: 4 additions & 4 deletions fem/integ/bilininteg_vecmass_pa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ void PAVectorMassApply2D(const int NE,
MFEM_SHARED real_t sB[MD1][MQ1];
MFEM_SHARED real_t smem[MQ1][MQ1];

kernels::internal::regs_t<VDIM, 1, MQ1, MQ1> r0, r1;
kernels::internal::vd_regs2d_t<VDIM, 1, MQ1> r0, r1;
kernels::internal::LoadMatrix(D1D, Q1D, b, sB);
kernels::internal::LoadDofs2d(e, D1D, XE, r0);
kernels::internal::Eval2d(D1D, Q1D, smem, sB, r0, r1);
Expand Down Expand Up @@ -129,7 +129,7 @@ static void PAVectorMassApply3D(const int NE,
MFEM_SHARED real_t sB[MD1][MQ1];
MFEM_SHARED real_t smem[MQ1][MQ1];

kernels::internal::regs_t<VDIM, 1, MQ1, MQ1, MQ1> r0, r1;
kernels::internal::vd_regs3d_t<VDIM, 1, MQ1> r0, r1;
kernels::internal::LoadMatrix(D1D, Q1D, b, sB);
kernels::internal::LoadDofs3d(e, D1D, XE, r0);
kernels::internal::Eval3d(D1D, Q1D, smem, sB, r0, r1);
Expand Down Expand Up @@ -197,8 +197,8 @@ VectorMassIntegrator::VectorMassAddMultPAType
VectorMassIntegrator::VectorMassAddMultPA::Fallback(int dim, int, int)
{
MFEM_VERIFY(dim != 1, "Unsupported 1D kernel");
if (dim == 2) { return internal::PAVectorMassApply2D; }
else if (dim == 3) { return internal::PAVectorMassApply3D; }
if (dim == 2) { return internal::PAVectorMassApply2D<1,1>; }
else if (dim == 3) { return internal::PAVectorMassApply3D<1,1>; }
else { MFEM_ABORT(""); }
}

Expand Down
110 changes: 61 additions & 49 deletions fem/integ/kernels_regs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,20 +28,32 @@ namespace internal
///////////////////////////////////////////////////////////////////////////////
#if ((defined(MFEM_USE_CUDA) && defined(__CUDA_ARCH__)) || \
(defined(MFEM_USE_HIP) && defined(__HIP_DEVICE_COMPILE__)))
template <int N>
using regs2d_t = mfem::internal::tensor<real_t, 0, 0>;
template <int MQ>
using regs2d_t = mfem::internal::tensor<real_t, 0, 0>;

template <int N>
using regs3d_t = mfem::internal::tensor<real_t, N, 0, 0>;
template <int VDIM, int DIM, int MQ>
using vd_regs2d_t = mfem::internal::tensor<real_t, VDIM, DIM, 0, 0>;

template <int MQ>
using regs3d_t = mfem::internal::tensor<real_t, MQ, 0, 0>;

template <int VDIM, int DIM, int N>
using regs5d_t = mfem::internal::tensor<real_t, VDIM, DIM, N, 0, 0>;
template <int VDIM, int DIM, int MQ>
using vd_regs3d_t = mfem::internal::tensor<real_t, VDIM, DIM, MQ, 0, 0>;

// Device (CUDA/HIP) branch: SetMaxOf performs no adjustment and hands the
// requested size back unchanged.
constexpr int SetMaxOf(int n)
{
   return n;
}
#else
template <int VDIM, int DIM, int... Ns>
using regs_t = mfem::internal::tensor<real_t, VDIM, DIM, Ns...>;
template <int MQ>
using regs2d_t = mfem::internal::tensor<real_t, MQ, MQ>;

template <int VDIM, int DIM, int MQ>
using vd_regs2d_t = mfem::internal::tensor<real_t, VDIM, DIM, MQ, MQ>;

template <int MQ>
using regs3d_t = mfem::internal::tensor<real_t, MQ, MQ, MQ>;

template <int VDIM, int DIM, int MQ>
using vd_regs3d_t = mfem::internal::tensor<real_t, VDIM, DIM, MQ, MQ, MQ>;

// on CPU, get next multiple of 4, allowing better alignments
template <int N>
Expand Down Expand Up @@ -73,7 +85,7 @@ void LoadMatrix(const int d1d, const int q1d,
template <int VDIM, int DIM, int MQ1 = 0>
inline MFEM_HOST_DEVICE void LoadDofs2d(const int e, const int d1d,
const DeviceTensor<4, const real_t> &X,
regs_t<VDIM, DIM, MQ1, MQ1> &Y)
vd_regs2d_t<VDIM, DIM, MQ1> &Y)
{
for (int c = 0; c < VDIM; ++c)
{
Expand All @@ -94,7 +106,7 @@ template <int VDIM, int DIM, int MQ1 = 0>
inline MFEM_HOST_DEVICE void LoadDofs2dOneComponent(const int e, const int c,
const int d1d,
const DeviceTensor<4, const real_t> &X,
regs_t<VDIM, DIM, MQ1, MQ1> &Y)
vd_regs2d_t<VDIM, DIM, MQ1> &Y)
{
MFEM_FOREACH_THREAD(dy, y, d1d)
{
Expand All @@ -113,8 +125,8 @@ template <bool Transpose, int MQ1>
inline MFEM_HOST_DEVICE void ContractX2d(const int d1d, const int q1d,
real_t (&smem)[MQ1][MQ1],
const real_t (*B)[MQ1],
const regs_t<MQ1, MQ1> &X,
regs_t<MQ1, MQ1> &Y)
const regs2d_t<MQ1> &X,
regs2d_t<MQ1> &Y)
{
MFEM_FOREACH_THREAD(y, y, d1d)
{
Expand Down Expand Up @@ -143,8 +155,8 @@ template <bool Transpose, int MQ1>
inline MFEM_HOST_DEVICE void ContractY2d(const int d1d, const int q1d,
real_t (&smem)[MQ1][MQ1],
const real_t (*B)[MQ1],
const regs_t<MQ1, MQ1> &X,
regs_t<MQ1, MQ1> &Y)
const regs2d_t<MQ1> &X,
regs2d_t<MQ1> &Y)
{
MFEM_FOREACH_THREAD(y, y, (Transpose ? q1d : d1d))
{
Expand All @@ -168,8 +180,8 @@ inline MFEM_HOST_DEVICE void ContractY2d(const int d1d, const int q1d,

template <int MQ1>
inline MFEM_HOST_DEVICE void Copy2d(const int q1d,
const regs_t<MQ1, MQ1> &X,
regs_t<MQ1, MQ1> &Y)
const regs2d_t<MQ1> &X,
regs2d_t<MQ1> &Y)
{
MFEM_FOREACH_THREAD(y, y, q1d)
{
Expand All @@ -182,8 +194,8 @@ inline MFEM_HOST_DEVICE void Contract2d(const int d1d, const int q1d,
real_t (&smem)[MQ1][MQ1],
const real_t (*Bx)[MQ1],
const real_t (*By)[MQ1],
regs_t<MQ1, MQ1> &X,
regs_t<MQ1, MQ1> &Y)
regs2d_t<MQ1> &X,
regs2d_t<MQ1> &Y)
{
if (!Transpose)
{
Expand All @@ -204,22 +216,22 @@ template <int VDIM, int DIM, int MD1, int MQ1, bool Transpose = false>
inline MFEM_HOST_DEVICE void Eval2d(const int d1d, const int q1d,
real_t (&smem)[MQ1][MQ1],
const real_t (&B)[MD1][MQ1],
regs_t<VDIM, DIM, MQ1, MQ1> &X,
regs_t<VDIM, DIM, MQ1, MQ1> &Y)
vd_regs2d_t<VDIM, DIM, MQ1> &X,
vd_regs2d_t<VDIM, DIM, MQ1> &Y)
{
static_assert(DIM == 1, "DIM must be 1");
for (int c = 0; c < VDIM; c++)
{
Contract2d<Transpose>(d1d, q1d, smem, B, B, X[c][0], Y[c][0]);
Contract2d<Transpose, MQ1>(d1d, q1d, smem, B, B, X[c][0], Y[c][0]);
}
}

template <int VDIM, int DIM, int MD1, int MQ1>
inline MFEM_HOST_DEVICE void EvalTranspose2d(const int d1d, const int q1d,
real_t (&smem)[MQ1][MQ1],
const real_t (&B)[MD1][MQ1],
regs_t<VDIM, DIM, MQ1, MQ1> &X,
regs_t<VDIM, DIM, MQ1, MQ1> &Y)
vd_regs2d_t<VDIM, DIM, MQ1> &X,
vd_regs2d_t<VDIM, DIM, MQ1> &Y)
{
Eval2d<VDIM, DIM, MD1, MQ1, true>(d1d, q1d, smem, B, X, Y);
}
Expand All @@ -230,8 +242,8 @@ inline MFEM_HOST_DEVICE void Grad2d(const int d1d, const int q1d,
real_t (&smem)[MQ1][MQ1],
const real_t (&B)[MD1][MQ1],
const real_t (&G)[MD1][MQ1],
regs_t<VDIM, DIM, MQ1, MQ1> &X,
regs_t<VDIM, DIM, MQ1, MQ1> &Y,
vd_regs2d_t<VDIM, DIM, MQ1> &X,
vd_regs2d_t<VDIM, DIM, MQ1> &Y,
const int k = -1)
{
for (int c = (k < 0 ? 0 : k); c < (k < 0 ? VDIM : k + 1); c++)
Expand All @@ -250,8 +262,8 @@ inline MFEM_HOST_DEVICE void GradTranspose2d(const int d1d, const int q1d,
real_t (&smem)[MQ1][MQ1],
const real_t (&B)[MD1][MQ1],
const real_t (&G)[MD1][MQ1],
regs_t<VDIM, DIM, MQ1, MQ1> &X,
regs_t<VDIM, DIM, MQ1, MQ1> &Y,
vd_regs2d_t<VDIM, DIM, MQ1> &X,
vd_regs2d_t<VDIM, DIM, MQ1> &Y,
const int k = -1)
{
constexpr bool Transpose = true;
Expand All @@ -261,7 +273,7 @@ inline MFEM_HOST_DEVICE void GradTranspose2d(const int d1d, const int q1d,
///////////////////////////////////////////////////////////////////////////////
template <int VDIM, int DIM, int MQ1 = 0>
inline MFEM_HOST_DEVICE void WriteDofs2d(const int e, const int d1d,
regs_t<VDIM, DIM, MQ1, MQ1> &X,
vd_regs2d_t<VDIM, DIM, MQ1> &X,
const DeviceTensor<4, real_t> &Y)
{
MFEM_FOREACH_THREAD(dy, y, d1d)
Expand All @@ -282,7 +294,7 @@ template <int VDIM, int DIM, int MQ1 = 0>
inline MFEM_HOST_DEVICE
void WriteDofs2dOneComponent(const int e, const int i, const int j,
const int d1d,
regs_t<VDIM, DIM, MQ1, MQ1> &X,
vd_regs2d_t<VDIM, DIM, MQ1> &X,
const DeviceTensor<4, real_t> &Y)
{
MFEM_FOREACH_THREAD(dy, y, d1d)
Expand All @@ -301,7 +313,7 @@ template <int VDIM, int DIM, int MQ1>
inline MFEM_HOST_DEVICE
void LoadDofs3d(const int e, const int d1d,
const DeviceTensor<5, const real_t> &X,
regs_t<VDIM, DIM, MQ1, MQ1, MQ1> &Y)
vd_regs3d_t<VDIM, DIM, MQ1> &Y)
{
for (int c = 0; c < VDIM; ++c)
{
Expand All @@ -326,7 +338,7 @@ inline MFEM_HOST_DEVICE
void LoadDofs3dOneComponent(const int e, const int c,
const int d1d,
const DeviceTensor<5, const real_t> &X,
regs_t<VDIM, DIM, MQ1, MQ1, MQ1> &Y)
vd_regs3d_t<VDIM, DIM, MQ1> &Y)
{
for (int dz = 0; dz < d1d; ++dz)
{
Expand All @@ -348,8 +360,8 @@ template <bool Transpose, int MQ1>
inline MFEM_HOST_DEVICE void ContractX3d(const int d1d, const int q1d,
real_t (&smem)[MQ1][MQ1],
const real_t (*B)[MQ1],
const regs_t<MQ1, MQ1, MQ1> &X,
regs_t<MQ1, MQ1, MQ1> &Y)
const regs3d_t<MQ1> &X,
regs3d_t<MQ1> &Y)
{
for (int z = 0; z < d1d; ++z)
{
Expand Down Expand Up @@ -382,8 +394,8 @@ template <bool Transpose, int MQ1>
inline MFEM_HOST_DEVICE void ContractY3d(const int d1d, const int q1d,
real_t (&smem)[MQ1][MQ1],
const real_t (*B)[MQ1],
const regs_t<MQ1, MQ1, MQ1> &X,
regs_t<MQ1, MQ1, MQ1> &Y)
const regs3d_t<MQ1> &X,
regs3d_t<MQ1> &Y)
{
for (int z = 0; z < d1d; ++z)
{
Expand Down Expand Up @@ -412,8 +424,8 @@ inline MFEM_HOST_DEVICE void ContractY3d(const int d1d, const int q1d,
template <bool Transpose, int MQ1>
inline MFEM_HOST_DEVICE void ContractZ3d(const int d1d, const int q1d,
const real_t (*B)[MQ1],
const regs_t<MQ1, MQ1, MQ1> &X,
regs_t<MQ1, MQ1, MQ1> &Y)
const regs3d_t<MQ1> &X,
regs3d_t<MQ1> &Y)
{
for (int z = 0; z < (Transpose ? d1d : q1d); ++z)
{
Expand All @@ -438,8 +450,8 @@ inline MFEM_HOST_DEVICE void Contract3d(const int d1d, const int q1d,
const real_t (*Bx)[MQ1],
const real_t (*By)[MQ1],
const real_t (*Bz)[MQ1],
regs_t<MQ1, MQ1, MQ1> &X,
regs_t<MQ1, MQ1, MQ1> &Y)
regs3d_t<MQ1> &X,
regs3d_t<MQ1> &Y)
{
if (!Transpose)
{
Expand All @@ -460,8 +472,8 @@ template <int VDIM, int DIM, int MD1, int MQ1, bool Transpose = false>
inline MFEM_HOST_DEVICE void Eval3d(const int d1d, const int q1d,
real_t (&smem)[MQ1][MQ1],
const real_t (&B)[MD1][MQ1],
regs_t<VDIM, DIM, MQ1, MQ1, MQ1> &X,
regs_t<VDIM, DIM, MQ1, MQ1, MQ1> &Y)
vd_regs3d_t<VDIM, DIM, MQ1> &X,
vd_regs3d_t<VDIM, DIM, MQ1> &Y)
{
static_assert(DIM == 1, "DIM must be 1");
for (int c = 0; c < VDIM; c++)
Expand All @@ -474,8 +486,8 @@ template <int VDIM, int DIM, int MD1, int MQ1>
inline MFEM_HOST_DEVICE void EvalTranspose3d(const int d1d, const int q1d,
real_t (&smem)[MQ1][MQ1],
const real_t (&B)[MD1][MQ1],
regs_t<VDIM, DIM, MQ1, MQ1, MQ1> &X,
regs_t<VDIM, DIM, MQ1, MQ1, MQ1> &Y)
vd_regs3d_t<VDIM, DIM, MQ1> &X,
vd_regs3d_t<VDIM, DIM, MQ1> &Y)
{
Eval3d<VDIM, DIM, MD1, MQ1, true>(d1d, q1d, smem, B, X, Y);
}
Expand All @@ -486,8 +498,8 @@ inline MFEM_HOST_DEVICE void Grad3d(const int d1d, const int q1d,
real_t (&smem)[MQ1][MQ1],
const real_t (&B)[MD1][MQ1],
const real_t (&G)[MD1][MQ1],
regs_t<VDIM, DIM, MQ1, MQ1, MQ1> &X,
regs_t<VDIM, DIM, MQ1, MQ1, MQ1> &Y,
vd_regs3d_t<VDIM, DIM, MQ1> &X,
vd_regs3d_t<VDIM, DIM, MQ1> &Y,
const int k = -1)
{
// for (int c = 0; c < VDIM; c++)
Expand All @@ -508,8 +520,8 @@ inline MFEM_HOST_DEVICE void Grad3dTranspose(const int d1d, const int q1d,
real_t (&smem)[MQ1][MQ1],
const real_t (&B)[MD1][MQ1],
const real_t (&G)[MD1][MQ1],
regs_t<VDIM, DIM, MQ1, MQ1, MQ1> &X,
regs_t<VDIM, DIM, MQ1, MQ1, MQ1> &Y,
vd_regs3d_t<VDIM, DIM, MQ1> &X,
vd_regs3d_t<VDIM, DIM, MQ1> &Y,
const int k = -1)
{
constexpr bool Transpose = true;
Expand All @@ -519,7 +531,7 @@ inline MFEM_HOST_DEVICE void Grad3dTranspose(const int d1d, const int q1d,
///////////////////////////////////////////////////////////////////////////////
template <int VDIM, int DIM, int MQ1>
inline MFEM_HOST_DEVICE void WriteDofs3d(const int e, const int d1d,
regs_t<VDIM, DIM, MQ1, MQ1, MQ1> &X,
vd_regs3d_t<VDIM, DIM, MQ1> &X,
const DeviceTensor<5, real_t> &Y)
{
for (int dz = 0; dz < d1d; ++dz)
Expand All @@ -543,7 +555,7 @@ template <int VDIM, int DIM, int MQ1>
inline MFEM_HOST_DEVICE
void WriteDofs3dOneComponent(const int e, const int i, const int j,
const int d1d,
regs_t<VDIM, DIM, MQ1, MQ1, MQ1> &X,
vd_regs3d_t<VDIM, DIM, MQ1> &X,
const DeviceTensor<5, real_t> &Y)
{
for (int dz = 0; dz < d1d; ++dz)
Expand Down