On Mon, 2023-10-02 at 10:16 +0200, Thomas Hellström wrote: > Hi, Zack >  > On 9/26/23 19:51, Zack Rusin wrote: > > From: Zack Rusin > >  > > Some drivers require the mapped tt pages to be decrypted. In an > > ideal > > world this would have been handled by the dma layer, but the TTM > > page > > fault handling would have to be rewritten to be able to do that. > >  > > A side-effect of the TTM page fault handling is using a dma > > allocation > > per order (via ttm_pool_alloc_page) which makes it impossible to > > just > > trivially use dma_mmap_attrs. As a result ttm has to be very > > careful > > about trying to make its pgprot for the mapped tt pages match what > > the dma layer thinks it is. At the ttm layer it's possible to > > deduce the requirement to have tt pages decrypted by checking > > whether coherent dma allocations have been requested and the system > > is running with confidential computing technologies. > >  > > This approach isn't ideal but keeping TTM matching DMA's > > expectations > > for the page properties is in general fragile, unfortunately a proper > > fix would require a rewrite of TTM's page fault handling. > >  > > Fixes vmwgfx with SEV enabled.
> >  > > v2: Explicitly include cc_platform.h > >  > > Signed-off-by: Zack Rusin > > Fixes: 3bf3710e3718 ("drm/ttm: Add a generic TTM memcpy move for > > page-based iomem") > > Cc: Christian König > > Cc: Thomas Hellström > > Cc: Huang Rui > > Cc: dri-devel@lists.freedesktop.org > > Cc: linux-kernel@vger.kernel.org > > Cc: # v5.14+ > > --- > >   drivers/gpu/drm/ttm/ttm_bo_util.c | 13 +++++++++++-- > >   drivers/gpu/drm/ttm/ttm_tt.c      |  8 ++++++++ > >   include/drm/ttm/ttm_tt.h          |  9 ++++++++- > >   3 files changed, 27 insertions(+), 3 deletions(-) > >  > > diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c > > b/drivers/gpu/drm/ttm/ttm_bo_util.c > > index fd9fd3d15101..0b3f4267130c 100644 > > --- a/drivers/gpu/drm/ttm/ttm_bo_util.c > > +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c > > @@ -294,7 +294,13 @@ pgprot_t ttm_io_prot(struct ttm_buffer_object > > *bo, struct ttm_resource *res, > >         enum ttm_caching caching; > >    > >         man = ttm_manager_type(bo->bdev, res->mem_type); > > -       caching = man->use_tt ? 
bo->ttm->caching : res- > > >bus.caching; > > +       if (man->use_tt) { > > +               caching = bo->ttm->caching; > > +               if (bo->ttm->page_flags & TTM_TT_FLAG_DECRYPTED) > > +                       tmp = pgprot_decrypted(tmp); > > +       } else  { > > +               caching = res->bus.caching; > > +       } > >    > >         return ttm_prot_from_caching(caching, tmp); > >   } > > @@ -337,6 +343,8 @@ static int ttm_bo_kmap_ttm(struct > > ttm_buffer_object *bo, > >                 .no_wait_gpu = false > >         }; > >         struct ttm_tt *ttm = bo->ttm; > > +       struct ttm_resource_manager *man = > > +                       ttm_manager_type(bo->bdev, bo->resource- > > >mem_type); > >         pgprot_t prot; > >         int ret; > >    > > @@ -346,7 +354,8 @@ static int ttm_bo_kmap_ttm(struct > > ttm_buffer_object *bo, > >         if (ret) > >                 return ret; > >    > > 
-       if (num_pages == 1 && ttm->caching == ttm_cached) { > > +       if (num_pages == 1 && ttm->caching == ttm_cached && > > +            !(man->use_tt && (ttm->page_flags & > > TTM_TT_FLAG_DECRYPTED))) { > >                 /* > >                  * We're mapping a single page, and the desired > >                  * page protection is consistent with the bo. > > diff --git a/drivers/gpu/drm/ttm/ttm_tt.c > > b/drivers/gpu/drm/ttm/ttm_tt.c > > index e0a77671edd6..e4966e2c988d 100644 > > --- a/drivers/gpu/drm/ttm/ttm_tt.c > > +++ b/drivers/gpu/drm/ttm/ttm_tt.c > > @@ -31,6 +31,7 @@ > >    > >   #define pr_fmt(fmt) "[TTM] " fmt > >    > > +#include > >   #include > >   #include > >   #include > > @@ -81,6 +82,13 @@ int ttm_tt_create(struct ttm_buffer_object *bo, > > bool zero_alloc) > >                 pr_err("Illegal buffer object type\n"); > >                 return -EINVAL; > >         } > > +       /* > > +        * When using dma_alloc_coherent with memory encryption the > > +        * mapped TT pages need to be decrypted or otherwise the > > drivers > > +        * will end up sending encrypted mem to the gpu.
> > +        */ > > +       if (bdev->pool.use_dma_alloc && > > cc_platform_has(CC_ATTR_MEM_ENCRYPT)) >  > You need to use CC_ATTR_GUEST_MEM_ENCRYPT here rather than  > CC_ATTR_MEM_ENCRYPT to avoid touching and breaking the SME case and > only  > fix the SEV / SEV-ES case. I'd also hold off the stable inclusion > until  > it's completely verified that this doesn't break anything because if > it  > does, I suspect all hell will break loose. >  > With that said, for the functionality >  > Reviewed-by: Thomas Hellström >  > But I think this needs a wider Ack at the ttm / drm level for the  > approach taken. >  > /Thomas. FWIW, I think that if TTM_TT_FLAG_DECRYPTED is set, it should be possible to add a debug WARN_ON_ONCE() if the first PTE of the dma page's kernel virtual address does not use a decrypted pgprot_t. One way of accessing the PTEs in a platform-generic fashion is apply_to_page_range().
/Thomas >  > > +               page_flags |= TTM_TT_FLAG_DECRYPTED; > >    > >         bo->ttm = bdev->funcs->ttm_tt_create(bo, page_flags); > >         if (unlikely(bo->ttm == NULL)) > > diff --git a/include/drm/ttm/ttm_tt.h b/include/drm/ttm/ttm_tt.h > > index a4eff85b1f44..2b9d856ff388 100644 > > --- a/include/drm/ttm/ttm_tt.h > > +++ b/include/drm/ttm/ttm_tt.h > > @@ -79,6 +79,12 @@ struct ttm_tt { > >          *   page_flags = TTM_TT_FLAG_EXTERNAL | > >          *                TTM_TT_FLAG_EXTERNAL_MAPPABLE; > >          * > > +        * TTM_TT_FLAG_DECRYPTED: The mapped ttm pages should be > > marked as > > +        * not encrypted. The framework will try to match what the > > dma layer > > +        * is doing, but note that it is a little fragile because > > ttm page > > +        * fault handling abuses the DMA api a bit and > > dma_map_attrs can't be > > +        * used to assure pgprot always matches. > > +        * > >          * TTM_TT_FLAG_PRIV_POPULATED: TTM internal only. DO NOT > > USE. This is > >          * set by TTM after ttm_tt_populate() has successfully > > returned, and is > >          * then unset when TTM calls ttm_tt_unpopulate().
> > @@ -87,8 +93,9 @@ struct ttm_tt { > >   #define TTM_TT_FLAG_ZERO_ALLOC                BIT(1) > >   #define TTM_TT_FLAG_EXTERNAL          BIT(2) > >   #define TTM_TT_FLAG_EXTERNAL_MAPPABLE BIT(3) > > +#define TTM_TT_FLAG_DECRYPTED          BIT(4) > >    > > -#define TTM_TT_FLAG_PRIV_POPULATED     BIT(4) > > +#define TTM_TT_FLAG_PRIV_POPULATED     BIT(5) > >         uint32_t page_flags; > >         /** @num_pages: Number of pages in the page array. */ > >         uint32_t num_pages;