34 Commits

Author SHA1 Message Date
  Nanley Chery 188193cbf2 iris: Add comments from Bspec fast-clear preamble page 3 days ago
  Nanley Chery 18e67d853f iris: Fix pipe control around fast-clears 3 days ago
  Georg Lehmann a2b70ce4ec aco/isel: remove uniform reduce/scan optimization 4 days ago
  Georg Lehmann 81245e262f radeonsi: use nir_opt_uniform_subgroup 4 days ago
  Georg Lehmann ec81337d8d radv: use nir_opt_uniform_subgroup 4 days ago
  Jose Maria Casanova Crespo f57add1d14 v3dv: Enable TFU blits with raster destinations on 7.1 HW (RPi5) 2 weeks ago
  Jose Maria Casanova Crespo 19580dfae1 v3d: Enable TFU blits with raster destinations on 7.1 HW (RPi5) 2 weeks ago
  Jose Maria Casanova Crespo 40339ada9c broadcom: Drop use of nir_lower_wrmasks 3 days ago
  Valentine Burley b1de4249f7 lavapipe/ci: Add a nightly ASAN job 6 days ago
  Karmjit Mahil cfd10a729d gallium: Fix gnu-empty-initalizer error 3 days ago
  Mary Guillemard 86d190e158 nvk: Use rendering state attachment count when setting SET_CT_SELECT 2 weeks ago
  Lionel Landwerlin e241e30986 anv: add a no-resource-barrier debug flag 3 weeks ago
  Lionel Landwerlin 5f58ac7b11 anv: implement WA_18039014283 1 year ago
  Lionel Landwerlin 15174b185b anv: instrument resource barriers instruction in u_trace 1 year ago
  Lionel Landwerlin 3520abf8a3 anv: use RESOURCE_BARRIER for event waiting when possible 1 year ago
  Lionel Landwerlin 5f9ece0b83 anv: implement Wa_18037648410 1 year ago
  Rohan Garg 24e9afb0b7 anv: implement resource barrier emissions 3 years ago
  Lionel Landwerlin e5fc567f49 anv: introduce an new virtual pipecontrol flag for BTI change 2 weeks ago
  Lionel Landwerlin 682f907228 intel: rename DCFlushEnable to ForceDeviceCoherency 1 year ago
  Rohan Garg e55a7bc83a anv: program STATE_COMPUTE_MODE to flush the L1 cache 1 year ago
  Lionel Landwerlin 47bc9da064 anv: use anv_add_pending_pipe_bits for event reset 1 year ago
  Lionel Landwerlin 8834ef8bcd anv: use flushing PIPE_CONTROL for event signaling 1 year ago
  Lionel Landwerlin a06b0213c8 anv: switch events to use 0/!0 values for unsignaled/signaled 1 year ago
  Lionel Landwerlin 5b0c2339d5 anv: use the blitter/video barrier helper for event signalling 1 year ago
  Lionel Landwerlin 5dd6f0d0ef anv: store event creation flags 1 year ago
  Lionel Landwerlin 72ee520b36 anv: remove unused event field 1 year ago
  Lionel Landwerlin 23be634934 anv: disable deferred bits on Gfx20+ 1 year ago
  Lionel Landwerlin be5f5f659f anv: consider CS coherent with L3 on Xe2+ 4 days ago
  Lionel Landwerlin 503355c7f8 anv: update pipeline barriers for Xe2+ 1 year ago
  Lionel Landwerlin 15524de710 anv: remove pb-stalls from various locations 1 year ago
  Lionel Landwerlin 86dceded22 anv: move cs/pb-stall detection to flushing function 2 weeks ago
  Lionel Landwerlin f2c571fabf anv: add tracking of involved stages in pipe flushes 1 year ago
  Lionel Landwerlin 4e8a25cf6f anv: remove use of emit_apply_pipe_flushes() in various helpers 2 weeks ago
  Lionel Landwerlin d37a888a9b anv: remove unused gpu_memcpy function 2 weeks ago
44 changed files with 1425 additions and 515 deletions
1. docs/envvars.rst (+3, -0)
2. src/amd/compiler/aco_validate.cpp (+0, -5)
3. src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp (+0, -168)
4. src/amd/vulkan/radv_pipeline.c (+8, -0)
5. src/broadcom/common/v3d_tfu.h (+5, -4)
6. src/broadcom/compiler/nir_to_vir.c (+3, -3)
7. src/broadcom/compiler/vir.c (+0, -1)
8. src/broadcom/vulkan/v3dv_meta_copy.c (+12, -9)
9. src/broadcom/vulkan/v3dvx_meta_common.c (+8, -5)
10. src/gallium/auxiliary/hud/hud_context.c (+1, -1)
11. src/gallium/auxiliary/postprocess/pp_mlaa.c (+1, -1)
12. src/gallium/drivers/iris/iris_clear.c (+84, -20)
13. src/gallium/drivers/iris/iris_state.c (+5, -0)
14. src/gallium/drivers/radeonsi/si_shader.c (+8, -0)
15. src/gallium/drivers/v3d/v3dx_tfu.c (+10, -5)
16. src/gallium/frontends/lavapipe/ci/deqp-lvp-asan.toml (+0, -1)
17. src/gallium/frontends/lavapipe/ci/gitlab-ci.yml (+9, -0)
18. src/gallium/frontends/lavapipe/ci/lvp-asan-fails.txt (+18, -0)
19. src/intel/dev/intel_debug.c (+1, -0)
20. src/intel/dev/intel_debug.h (+1, -0)
21. src/intel/ds/intel_driver_ds.cc (+125, -27)
22. src/intel/ds/intel_driver_ds.h (+19, -0)
23. src/intel/ds/intel_tracepoints.py (+69, -2)
24. src/intel/genxml/gen200.xml (+1, -1)
25. src/intel/vulkan/anv_blorp.c (+24, -2)
26. src/intel/vulkan/anv_cmd_buffer.c (+2, -0)
27. src/intel/vulkan/anv_event.c (+5, -4)
28. src/intel/vulkan/anv_genX.h (+18, -29)
29. src/intel/vulkan/anv_private.h (+47, -10)
30. src/intel/vulkan/anv_util.c (+20, -3)
31. src/intel/vulkan/anv_utrace.c (+0, -1)
32. src/intel/vulkan/genX_acceleration_structure.c (+10, -2)
33. src/intel/vulkan/genX_blorp_exec.c (+11, -10)
34. src/intel/vulkan/genX_cmd_buffer.c (+819, -152)
35. src/intel/vulkan/genX_cmd_compute.c (+4, -2)
36. src/intel/vulkan/genX_cmd_draw.c (+13, -2)
37. src/intel/vulkan/genX_cmd_draw_generated_flush.h (+3, -3)
38. src/intel/vulkan/genX_cmd_draw_generated_indirect.h (+12, -0)
39. src/intel/vulkan/genX_gfx_state.c (+4, -3)
40. src/intel/vulkan/genX_gpu_memcpy.c (+14, -24)
41. src/intel/vulkan/genX_init_state.c (+3, -0)
42. src/intel/vulkan/genX_query.c (+19, -2)
43. src/intel/vulkan/genX_simple_shader.c (+5, -12)
44. src/nouveau/vulkan/nvk_cmd_draw.c (+1, -1)

docs/envvars.rst (+3, -0)

@@ -606,6 +606,9 @@ Intel driver environment variables
disable fast clears
``noccs``
disable lossless color compression
``no-resource-barrier``
disable RENDER_BARRIER instruction usage by falling back to
PIPE_CONTROL
``optimizer``
dump shader assembly to files at each optimization pass and
iteration that make progress (Gfx < 9)
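
For context, the options in this table are comma-separated values of the INTEL_DEBUG environment variable, so the new switch can be combined with the existing flags when diagnosing barrier-related issues. A minimal usage sketch (the application name and the flag combination are illustrative only):

    INTEL_DEBUG=no-resource-barrier,stall vkcube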


src/amd/compiler/aco_validate.cpp (+0, -5)

@@ -784,11 +784,6 @@ validate_ir(Program* program)
check(instr->definitions[0].regClass().type() == RegType::sgpr ||
program->wave_size == 32,
"The result of unclustered reductions must go into an SGPR.", instr.get());
else
check(instr->definitions[0].regClass().type() == RegType::vgpr,
"The result of scans and clustered reductions must go into a VGPR.",
instr.get());

break;
}
case Format::SMEM: {


src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp (+0, -168)

@@ -3440,156 +3440,6 @@ emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src)
bld.copy(dst, src);
}

void
emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count)
{
Builder bld(ctx->program, ctx->block);
Temp src_tmp = get_ssa_temp(ctx, src.ssa);

if (op == nir_op_fadd) {
src_tmp = as_vgpr(ctx, src_tmp);
Temp tmp = dst.regClass() == s1 ? bld.tmp(RegClass::get(RegType::vgpr, src.ssa->bit_size / 8))
: dst.getTemp();

if (src.ssa->bit_size == 16) {
count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count);
bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp);
} else {
assert(src.ssa->bit_size == 32);
count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count);
bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp);
}

if (tmp != dst.getTemp())
bld.pseudo(aco_opcode::p_as_uniform, dst, tmp);

return;
}

if (dst.regClass() == s1)
src_tmp = bld.as_uniform(src_tmp);

if (op == nir_op_ixor && count.type() == RegType::sgpr)
count =
bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand::c32(1u));
else if (op == nir_op_ixor)
count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), count);

assert(dst.getTemp().type() == count.type());

if (nir_src_is_const(src)) {
uint32_t imm = nir_src_as_uint(src);
if (imm == 1 && dst.bytes() <= 2)
bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero());
else if (imm == 1)
bld.copy(dst, count);
else if (imm == 0)
bld.copy(dst, Operand::zero(dst.bytes()));
else if (count.type() == RegType::vgpr)
bld.v_mul_imm(dst, count, imm, true, true);
else if (imm == 0xffffffff)
bld.sop2(aco_opcode::s_sub_i32, dst, bld.def(s1, scc), Operand::zero(), count);
else if (util_is_power_of_two_or_zero(imm))
bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), count,
Operand::c32(ffs(imm) - 1u));
else
bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
} else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count);
} else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count);
} else if (dst.getTemp().type() == RegType::vgpr) {
bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count);
} else {
bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
}
}

bool
emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
{
nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
if (op == nir_op_imul || op == nir_op_fmul)
return false;

if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
Builder bld(ctx->program, ctx->block);
Definition dst(get_ssa_temp(ctx, &instr->def));
unsigned bit_size = instr->src[0].ssa->bit_size;
if (bit_size > 32)
return false;

Temp thread_count =
bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
set_wqm(ctx);

emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
} else {
emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
}

return true;
}

bool
emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
{
Builder bld(ctx->program, ctx->block);
Definition dst(get_ssa_temp(ctx, &instr->def));
nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan;

if (op == nir_op_imul || op == nir_op_fmul)
return false;

if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
if (instr->src[0].ssa->bit_size > 32)
return false;

Temp packed_tid;
if (inc)
packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
else
packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
set_wqm(ctx);

emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
return true;
}

assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax ||
op == nir_op_iand || op == nir_op_ior || op == nir_op_fmin || op == nir_op_fmax);

if (inc) {
emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
return true;
}

/* Copy the source and write the reduction operation identity to the first lane. */
Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size);
if (dst.bytes() == 8) {
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
uint32_t identity_lo = get_reduction_identity(reduce_op, 0);
uint32_t identity_hi = get_reduction_identity(reduce_op, 1);

lo =
bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_lo)), lane, lo);
hi =
bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_hi)), lane, hi);
bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi);
} else {
uint32_t identity = get_reduction_identity(reduce_op, 0);
bld.writelane(dst, bld.copy(bld.def(s1, m0), Operand::c32(identity)), lane,
as_vgpr(ctx, src));
}

set_wqm(ctx);
return true;
}

Temp
emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size,
Definition dst, Temp src)
@@ -4498,24 +4348,6 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
const unsigned bit_size = instr->src[0].ssa->bit_size;
assert(bit_size != 1);

if (!nir_src_is_divergent(&instr->src[0])) {
/* We use divergence analysis to assign the regclass, so check if it's
* working as expected */
ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan;
if (instr->intrinsic == nir_intrinsic_inclusive_scan ||
cluster_size != ctx->program->wave_size)
expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor ||
op == nir_op_imul || op == nir_op_fmul;
assert(instr->def.divergent == expected_divergent);

if (instr->intrinsic == nir_intrinsic_reduce) {
if (!instr->def.divergent && emit_uniform_reduce(ctx, instr))
break;
} else if (emit_uniform_scan(ctx, instr)) {
break;
}
}

src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
ReduceOp reduce_op = get_reduce_op(op, bit_size);



src/amd/vulkan/radv_pipeline.c (+8, -0)

@@ -473,6 +473,14 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
if (!radv_use_llvm_for_stage(pdev, stage->stage))
ac_nir_optimize_uniform_atomics(stage->nir);

NIR_PASS(_, stage->nir, nir_opt_uniform_subgroup,
&(struct nir_lower_subgroups_options){
.subgroup_size = stage->info.wave_size,
.ballot_bit_size = stage->info.wave_size,
.ballot_components = 1,
.lower_ballot_bit_count_to_mbcnt_amd = true,
});

NIR_PASS(_, stage->nir, nir_opt_idiv_const, 8);

NIR_PASS(_, stage->nir, nir_lower_idiv,


src/broadcom/common/v3d_tfu.h (+5, -4)

@@ -51,11 +51,12 @@
/* Disable level 0 write, just write following mipmaps */
#define V3D71_TFU_IOC_DIMTW (1 << 0)
#define V3D71_TFU_IOC_FORMAT_SHIFT 12
#define V3D71_TFU_IOC_FORMAT_RASTER 0
#define V3D71_TFU_IOC_FORMAT_LINEARTILE 3
#define V3D71_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN 4
#define V3D71_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN 5
#define V3D71_TFU_IOA_FORMAT_UIF_NO_XOR 6
#define V3D71_TFU_IOA_FORMAT_UIF_XOR 7
#define V3D71_TFU_IOC_FORMAT_UBLINEAR_1_COLUMN 4
#define V3D71_TFU_IOC_FORMAT_UBLINEAR_2_COLUMN 5
#define V3D71_TFU_IOC_FORMAT_UIF_NO_XOR 6
#define V3D71_TFU_IOC_FORMAT_UIF_XOR 7

#define V3D71_TFU_IOC_STRIDE_SHIFT 16
#define V3D71_TFU_IOC_NUMMM_SHIFT 4


src/broadcom/compiler/nir_to_vir.c (+3, -3)

@@ -721,9 +721,9 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
}
}

/* nir_lower_wrmasks should've ensured that any writemask on a store
* operation only has consecutive bits set, in which case we should've
* processed the full writemask above.
/* v3d_nir_lower_load_store_bitsize should've ensured that any writemask
* on a store operation only has consecutive bits set, in which case
* we should've processed the full writemask above.
*/
assert(writemask == 0);
}


src/broadcom/compiler/vir.c (+0, -1)

@@ -1845,7 +1845,6 @@ v3d_attempt_compile(struct v3d_compile *c)
glsl_get_natural_size_align_bytes);

NIR_PASS(_, c->s, v3d_nir_lower_global_2x32);
NIR_PASS(_, c->s, nir_lower_wrmasks);
NIR_PASS(_, c->s, v3d_nir_lower_load_store_bitsize);
NIR_PASS(_, c->s, v3d_nir_lower_scratch);



src/broadcom/vulkan/v3dv_meta_copy.c (+12, -9)

@@ -1152,8 +1152,8 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
return false;
}

/* Destination can't be raster format */
if (!dst->tiled)
/* Destination can't be raster format on V3D 4.2 */
if (cmd_buffer->device->devinfo.ver < 71 && !dst->tiled)
return false;

/* We can only do full copies, so if the format is D24S8 both aspects need
@@ -1266,7 +1266,8 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
dst->planes[dst_plane].mem->bo->handle,
dst_offset,
dst_slice->tiling,
dst_slice->padded_height,
dst_slice->tiling == V3D_TILING_RASTER ?
dst_slice->stride : dst_slice->padded_height,
dst->planes[dst_plane].cpp,
src->planes[src_plane].mem->bo->handle,
src_offset,
@@ -1869,8 +1870,8 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,

assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT);

/* Destination can't be raster format */
if (!image->tiled)
/* Destination can't be raster format on V3D 4.2 */
if (cmd_buffer->device->devinfo.ver < 71 && !image->tiled)
return false;

/* We can't copy D24S8 because buffer to image copies only copy one aspect
@@ -1968,7 +1969,8 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
dst_bo->handle,
dst_offset,
slice->tiling,
slice->padded_height,
slice->tiling == V3D_TILING_RASTER ?
slice->stride : slice->padded_height,
image->planes[plane].cpp,
src_bo->handle,
src_offset,
@@ -3398,8 +3400,8 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer,
if (src->vk.format != dst->vk.format)
return false;

/* Destination can't be raster format */
if (!dst->tiled)
/* Destination can't be raster format on V3D 4.2 */
if (cmd_buffer->device->devinfo.ver < 71 && !dst->tiled)
return false;

/* Source region must start at (0,0) */
@@ -3507,7 +3509,8 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer,
dst->planes[0].mem->bo->handle,
dst_offset,
dst_slice->tiling,
dst_slice->padded_height,
dst_slice->tiling == V3D_TILING_RASTER ?
dst_slice->stride : dst_slice->padded_height,
dst->planes[0].cpp,
src->planes[0].mem->bo->handle,
src_offset,


src/broadcom/vulkan/v3dvx_meta_common.c (+8, -5)

@@ -980,10 +980,13 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
#endif

#if V3D_VERSION >= 71
tfu.v71.ioc = (V3D71_TFU_IOC_FORMAT_LINEARTILE +
(dst_tiling - V3D_TILING_LINEARTILE)) <<
V3D71_TFU_IOC_FORMAT_SHIFT;

if (dst_tiling == V3D_TILING_RASTER) {
tfu.v71.ioc = V3D71_TFU_IOC_FORMAT_RASTER << V3D71_TFU_IOC_FORMAT_SHIFT;
} else {
tfu.v71.ioc = (V3D71_TFU_IOC_FORMAT_LINEARTILE +
(dst_tiling - V3D_TILING_LINEARTILE)) <<
V3D71_TFU_IOC_FORMAT_SHIFT;
}
switch (dst_tiling) {
case V3D_TILING_UIF_NO_XOR:
case V3D_TILING_UIF_XOR:
@@ -1012,10 +1015,10 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
break;
}

#if V3D_VERSION <= 42
/* The TFU can handle raster sources but always produces UIF results */
assert(dst_tiling != V3D_TILING_RASTER);

#if V3D_VERSION <= 42
/* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
* OPAD field for the destination (how many extra UIF blocks beyond
* those necessary to cover the height).


src/gallium/auxiliary/hud/hud_context.c (+1, -1)

@@ -495,7 +495,7 @@ hud_draw_results(struct hud_context *hud, struct pipe_resource *tex)
const struct pipe_sampler_state *sampler_states[] =
{ &hud->font_sampler_state };
struct hud_pane *pane;
struct pipe_resource *releasebuf[3] = {};
struct pipe_resource *releasebuf[3] = { 0 };

if (!huds_visible)
return;


src/gallium/auxiliary/postprocess/pp_mlaa.c (+1, -1)

@@ -73,7 +73,7 @@ pp_jimenezmlaa_run(struct pp_queue_t *ppq, struct pipe_resource *in,

const struct pipe_stencil_ref ref = { {1} };

struct pipe_resource *releasebuf[2] = {};
struct pipe_resource *releasebuf[2] = { 0 };

/* Insufficient initialization checks. */
assert(p);
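
Both gallium fixes above replace the same construct, so a tiny standalone C sketch (not the driver's code; the type is only forward-declared here) of what the gnu-empty-initializer diagnostic objects to may be useful:

/* An empty initializer list "{}" is a GNU/C++ extension in C prior to C23,
 * which is what clang's -Wgnu-empty-initializer diagnostic flags;
 * "{ 0 }" is standard C and still zero-initializes the whole array.
 */
struct pipe_resource;                      /* opaque, as in gallium */

void example(void)
{
   /* struct pipe_resource *bufs[3] = {};     warns under -Wgnu-empty-initializer */
   struct pipe_resource *bufs[3] = { 0 };  /* portable, fully zeroed */
   (void)bufs;
}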


src/gallium/drivers/iris/iris_clear.c (+84, -20)

@@ -268,26 +268,90 @@ fast_clear_color(struct iris_context *ice,

iris_resource_set_clear_color(ice, res, color);

/* Ivybridge PRM Vol 2, Part 1, "11.7 MCS Buffer for Render Target(s)":
*
* "Any transition from any value in {Clear, Render, Resolve} to a
* different value in {Clear, Render, Resolve} requires end of pipe
* synchronization."
*
* In other words, fast clear ops are not properly synchronized with
* other drawing. We need to use a PIPE_CONTROL to ensure that the
* contents of the previous draw hit the render target before we resolve
* and again afterwards to ensure that the resolve is complete before we
* do any more regular drawing.
*/
iris_emit_end_of_pipe_sync(batch, "fast clear: pre-flush",
PIPE_CONTROL_RENDER_TARGET_FLUSH |
(devinfo->ver == 12 ? PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
PIPE_CONTROL_TILE_CACHE_FLUSH : 0) |
(devinfo->verx10 == 120 ? PIPE_CONTROL_DEPTH_STALL : 0) |
(devinfo->verx10 == 125 ? PIPE_CONTROL_FLUSH_HDC |
PIPE_CONTROL_DATA_CACHE_FLUSH : 0) |
PIPE_CONTROL_PSS_STALL_SYNC);
if (devinfo->ver >= 20) {
/* From the Xe2 Bspec 57340 (r59562),
* "MCS/CCS Buffers, Fast Clear for Render Target(s)":
*
* Synchronization:
* Due to interaction of scaled clearing rectangle with pixel
* scoreboard, we require one of the following commands to be
* issued. [...]
*
* PIPE_CONTROL
* PSS Stall Sync Enable [...] 1b (Enable)
* Machine-wide Stall at Pixel Stage, wait for all Prior Pixel
* Work to Reach End of Pipe
* Render Target Cache Flush Enable [...] 1b (Enable)
* Post-Sync Op Flushes Render Cache before Unblocking Stall
*
* This synchronization step is required before and after the fast
* clear pass, to ensure correct ordering between pixels.
*/
iris_emit_pipe_control_flush(batch, "fast clear: pre-flush",
PIPE_CONTROL_RENDER_TARGET_FLUSH |
PIPE_CONTROL_PSS_STALL_SYNC);
} else if (devinfo->verx10 >= 125) {
/* From the ACM Bspec 47704 (r52663), "Render Target Fast Clear":
*
* Preamble pre fast clear synchronization
*
* PIPE_CONTROL:
* PS sync stall = 1
* Tile Cache Flush = 1
* RT Write Flush = 1
* HDC Flush = 1
* DC Flush = 1
* Texture Invalidate = 1
*
* [...]
*
* Objective of the preamble flushes is to ensure all data is
* evicted from L1 caches prior to fast clear.
*/
iris_emit_pipe_control_flush(batch, "fast clear: pre-flush",
PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
PIPE_CONTROL_DATA_CACHE_FLUSH |
PIPE_CONTROL_FLUSH_HDC |
PIPE_CONTROL_RENDER_TARGET_FLUSH |
PIPE_CONTROL_TILE_CACHE_FLUSH |
PIPE_CONTROL_PSS_STALL_SYNC);
} else if (devinfo->verx10 >= 120) {
/* From the TGL Bspec 47704 (r52663), "Render Target Fast Clear":
*
* Preamble pre fast clear synchronization
*
* PIPE_CONTROL:
* Depth Stall = 1
* Tile Cache Flush = 1
* RT Write Flush = 1
* Texture Invalidate = 1
*
* [...]
*
* Objective of the preamble flushes is to ensure all data is
* evicted from L1 caches prior to fast clear.
*/
iris_emit_pipe_control_flush(batch, "fast clear: pre-flush",
PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
PIPE_CONTROL_RENDER_TARGET_FLUSH |
PIPE_CONTROL_TILE_CACHE_FLUSH |
PIPE_CONTROL_DEPTH_STALL);
} else {
/* Ivybridge PRM Vol 2, Part 1, "11.7 MCS Buffer for Render Target(s)":
*
* "Any transition from any value in {Clear, Render, Resolve} to a
* different value in {Clear, Render, Resolve} requires end of pipe
* synchronization."
*
* In other words, fast clear ops are not properly synchronized with
* other drawing. We need to use a PIPE_CONTROL to ensure that the
* contents of the previous draw hit the render target before we resolve
* and again afterwards to ensure that the resolve is complete before we
* do any more regular drawing.
*/
iris_emit_end_of_pipe_sync(batch, "fast clear: pre-flush",
PIPE_CONTROL_RENDER_TARGET_FLUSH);
}

/* Update the clear color now that previous rendering is complete. */
if (color_changed && res->aux.clear_color_bo)


src/gallium/drivers/iris/iris_state.c (+5, -0)

@@ -10421,7 +10421,12 @@ iris_emit_raw_pipe_control(struct iris_batch *batch,
#endif
pc.LRIPostSyncOperation = NoLRIOperation;
pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
#if GFX_VER >= 20
pc.ForceDeviceCoherency = flags & (PIPE_CONTROL_TILE_CACHE_FLUSH |
PIPE_CONTROL_DATA_CACHE_FLUSH);
#else
pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
#endif
pc.StoreDataIndex = 0;
pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
#if GFX_VERx10 < 125


src/gallium/drivers/radeonsi/si_shader.c (+8, -0)

@@ -985,6 +985,14 @@ static void run_late_optimization_and_lowering_passes(struct si_nir_shader_ctx *
if (nir->info.use_aco_amd)
progress |= ac_nir_optimize_uniform_atomics(nir);

NIR_PASS(progress, nir, nir_opt_uniform_subgroup,
&(struct nir_lower_subgroups_options){
.subgroup_size = shader->wave_size,
.ballot_bit_size = shader->wave_size,
.ballot_components = 1,
.lower_ballot_bit_count_to_mbcnt_amd = true,
});

NIR_PASS(progress, nir, si_nir_lower_abi, shader, &ctx->args);
/* Global access lowering must be called after lowering ABI which emits regular load_global intrinsics. */
NIR_PASS(progress, nir, ac_nir_lower_global_access);


src/gallium/drivers/v3d/v3dx_tfu.c (+10, -5)

@@ -56,9 +56,11 @@ v3dX(tfu)(struct pipe_context *pctx,
if (pdst->target != PIPE_TEXTURE_2D || psrc->target != PIPE_TEXTURE_2D)
return false;

#if V3D_VERSION == 42
/* Can't write to raster. */
if (dst_base_slice->tiling == V3D_TILING_RASTER)
return false;
#endif

/* When using TFU for blit, we are doing exact copies (both input and
* output format must be the same, no scaling, etc), so there is no
@@ -84,7 +86,6 @@ v3dX(tfu)(struct pipe_context *pctx,
assert(for_mipmap);
return false;
}

MESA_TRACE_FUNC();

v3d_flush_jobs_writing_resource(v3d, psrc, V3D_FLUSH_DEFAULT, false);
@@ -171,9 +172,13 @@ v3dX(tfu)(struct pipe_context *pctx,
if (last_level != base_level)
tfu.v71.ioc |= V3D71_TFU_IOC_DIMTW;

tfu.v71.ioc |= ((V3D71_TFU_IOC_FORMAT_LINEARTILE +
(dst_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
V3D71_TFU_IOC_FORMAT_SHIFT);
if (dst_base_slice->tiling == V3D_TILING_RASTER) {
tfu.v71.ioc |= V3D71_TFU_IOC_FORMAT_RASTER << V3D71_TFU_IOC_FORMAT_SHIFT;
} else {
tfu.v71.ioc |= ((V3D71_TFU_IOC_FORMAT_LINEARTILE +
(dst_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
V3D71_TFU_IOC_FORMAT_SHIFT);
}

switch (dst_base_slice->tiling) {
case V3D_TILING_UIF_NO_XOR:
@@ -183,7 +188,7 @@ v3dX(tfu)(struct pipe_context *pctx,
V3D71_TFU_IOC_STRIDE_SHIFT;
break;
case V3D_TILING_RASTER:
tfu.v71.ioc |= (dst_base_slice->padded_height / dst->cpp) <<
tfu.v71.ioc |= (dst_base_slice->stride / dst->cpp) <<
V3D71_TFU_IOC_STRIDE_SHIFT;
break;
default:


src/gallium/frontends/lavapipe/ci/deqp-lvp-asan.toml (+0, -1)

@@ -3,4 +3,3 @@ deqp = "/deqp-vk/external/vulkancts/modules/vulkan/deqp-vk"
caselists = ["/deqp-vk/mustpass/vk-main.txt"]
tests_per_group = 1
renderer_check = "llvmpipe"
fraction = 1000

src/gallium/frontends/lavapipe/ci/gitlab-ci.yml (+9, -0)

@@ -31,11 +31,20 @@ lavapipe-vkcts-asan:
GPU_VERSION: lvp-asan
DEQP_FORCE_ASAN: 1
S3_ARTIFACT_NAME: mesa-x86_64-asan-debugoptimized
DEQP_FRACTION: 1000
needs:
- job: debian/x86_64_test-vk
optional: true
- job: debian-x86_64-asan

lavapipe-vkcts-asan-full:
extends:
- lavapipe-vkcts-asan
- .lavapipe-manual-rules
timeout: 2h
variables:
DEQP_FRACTION: 40

lavapipe-vkd3d:
extends:
- .lavapipe-test


src/gallium/frontends/lavapipe/ci/lvp-asan-fails.txt (+18, -0)

@@ -28,3 +28,21 @@ dEQP-VK.dynamic_state.monolithic.compute_transfer.single.compute.vertex_input_bi

# New fails in 1.4.1.1
dEQP-VK.pipeline.pipeline_library.bind_buffers_2.maintenance5.robustness2.triangle_list.buffers9.stride_offset_rnd654.true_size.beyond_size,Fail

# New fails from nightly job
dEQP-VK.image.queue_transfer.3d.4x3x1.r64g64b64a64_sint,Crash
dEQP-VK.image.subresource_layout.2d.1_level.r64g64b64a64_sint_offset,Crash
dEQP-VK.image.subresource_layout.3d.2_levels.r64g64b64a64_sint,Crash
dEQP-VK.pipeline.pipeline_library.extended_dynamic_state.two_draws_static.tess_domain_origin_upper_left,Fail
dEQP-VK.pipeline.shader_object_unlinked_binary.bind_buffers_2.maintenance5.robustness2.triangle_list.buffers9.stride_offset_rnd654.true_size.beyond_size,Fail
dEQP-VK.pipeline.shader_object_unlinked_spirv.bind_buffers_2.maintenance5.robustness2.triangle_list.buffers5.stride_offset_rnd654.true_size.beyond_size,Fail
dEQP-VK.shader_object.misc.state.pipeline.vert_tess_frag.depth.clamp,Fail
dEQP-VK.shader_object.misc.state.pipeline.vert_tess_geom_frag.rasterization_discard.disabled,Fail

# New failures with VKCTS 1.4.4.0
dEQP-VK.pipeline.monolithic.extended_dynamic_state.cmd_buffer_start.large_stride_with_offset_and_padding,Fail
dEQP-VK.pipeline.pipeline_library.extended_dynamic_state.before_good_static.tess_domain_origin_upper_left,Fail
dEQP-VK.pipeline.shader_object_linked_binary.multisample.sample_rate_a2c.dynamic_a2c,Fail
dEQP-VK.pipeline.shader_object_linked_spirv.bind_buffers_2.maintenance5.robustness2.triangle_list.buffers9.stride_offset_rnd654.true_size.beyond_size,Fail
dEQP-VK.pipeline.shader_object_unlinked_spirv.extended_dynamic_state.cmd_buffer_start.stride_with_offset_and_padding,Fail
dEQP-VK.shader_object.misc.state.pipeline.vert_tess_frag.color_write.true,Fail

src/intel/dev/intel_debug.c (+1, -0)

@@ -111,6 +111,7 @@ static const struct debug_control_bitset debug_control[] = {
OPT1("task", DEBUG_TASK),
OPT1("mesh", DEBUG_MESH),
OPT1("stall", DEBUG_STALL),
OPT1("no-resource-barrier", DEBUG_NO_RESOURCE_BARRIER),
OPT1("capture-all", DEBUG_CAPTURE_ALL),
OPT1("perf-symbol-names", DEBUG_PERF_SYMBOL_NAMES),
OPT1("swsb-stall", DEBUG_SWSB_STALL),


src/intel/dev/intel_debug.h (+1, -0)

@@ -54,6 +54,7 @@ enum intel_debug_flag {
DEBUG_URB,
DEBUG_CLIP,
DEBUG_STALL,
DEBUG_NO_RESOURCE_BARRIER,
DEBUG_BLORP,
DEBUG_NO_DUAL_OBJECT_GS,
DEBUG_OPTIMIZER,


src/intel/ds/intel_driver_ds.cc (+125, -27)

@@ -307,42 +307,112 @@ end_event(struct intel_ds_queue *queue, uint64_t ts_ns,
stage->start_ns[level] = 0;
}

static size_t
snprintf_stages(char *buf, size_t buf_size,
enum intel_ds_barrier_type type,
enum intel_ds_stages signal_stages,
enum intel_ds_stages wait_stages)
{
return
snprintf(buf, buf_size, "%s: %s%s%s%s%s%s%s->%s%s%s%s%s%s%s: ",
type == INTEL_DS_BARRIER_TYPE_IMMEDIATE ? "imm" :
type == INTEL_DS_BARRIER_TYPE_SIGNAL ? "signal" :
type == INTEL_DS_BARRIER_TYPE_WAIT ? "wait" : "unknown",
(signal_stages & INTEL_DS_STAGES_TOP_BIT) ? "+top" : "",
(signal_stages & INTEL_DS_STAGES_GEOM_BIT) ? "+geom" : "",
(signal_stages & INTEL_DS_STAGES_RASTER_BIT) ? "+rast" : "",
(signal_stages & INTEL_DS_STAGES_DEPTH_BIT) ? "+ds" : "",
(signal_stages & INTEL_DS_STAGES_PIXEL_BIT) ? "+pix" : "",
(signal_stages & INTEL_DS_STAGES_COLOR_BIT) ? "+col" : "",
(signal_stages & INTEL_DS_STAGES_GPGPU_BIT) ? "+cs" : "",
(wait_stages & INTEL_DS_STAGES_TOP_BIT) ? "+top" : "",
(wait_stages & INTEL_DS_STAGES_GEOM_BIT) ? "+geom" : "",
(wait_stages & INTEL_DS_STAGES_RASTER_BIT) ? "+rast" : "",
(wait_stages & INTEL_DS_STAGES_DEPTH_BIT) ? "+ds" : "",
(wait_stages & INTEL_DS_STAGES_PIXEL_BIT) ? "+pix" : "",
(wait_stages & INTEL_DS_STAGES_COLOR_BIT) ? "+col" : "",
(wait_stages & INTEL_DS_STAGES_GPGPU_BIT) ? "+cs" : "");
}

static size_t
snprintf_flags(char *buf, size_t buf_size, enum intel_ds_stall_flag bits)
{
return
snprintf(buf, buf_size, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
(bits & INTEL_DS_DEPTH_CACHE_FLUSH_BIT) ? "+depth_flush" : "",
(bits & INTEL_DS_DATA_CACHE_FLUSH_BIT) ? "+dc_flush" : "",
(bits & INTEL_DS_HDC_PIPELINE_FLUSH_BIT) ? "+hdc_flush" : "",
(bits & INTEL_DS_RENDER_TARGET_CACHE_FLUSH_BIT) ? "+rt_flush" : "",
(bits & INTEL_DS_TILE_CACHE_FLUSH_BIT) ? "+tile_flush" : "",
(bits & INTEL_DS_L3_FABRIC_FLUSH_BIT) ? "+l3_fabric_flush" : "",
(bits & INTEL_DS_STATE_CACHE_INVALIDATE_BIT) ? "+state_inv" : "",
(bits & INTEL_DS_CONST_CACHE_INVALIDATE_BIT) ? "+const_inv" : "",
(bits & INTEL_DS_VF_CACHE_INVALIDATE_BIT) ? "+vf_inv" : "",
(bits & INTEL_DS_TEXTURE_CACHE_INVALIDATE_BIT) ? "+tex_inv" : "",
(bits & INTEL_DS_INST_CACHE_INVALIDATE_BIT) ? "+inst_inv" : "",
(bits & INTEL_DS_STALL_AT_SCOREBOARD_BIT) ? "+pb_stall" : "",
(bits & INTEL_DS_DEPTH_STALL_BIT) ? "+depth_stall" : "",
(bits & INTEL_DS_CS_STALL_BIT) ? "+cs_stall" : "",
(bits & INTEL_DS_UNTYPED_DATAPORT_CACHE_FLUSH_BIT) ? "+udp_flush" : "",
(bits & INTEL_DS_END_OF_PIPE_BIT) ? "+eop" : "",
(bits & INTEL_DS_CCS_CACHE_FLUSH_BIT) ? "+ccs_flush" : "");
}

static size_t
snprintf_reasons(char *buf, size_t buf_size,
const char *r1, const char *r2,
const char *r3, const char *r4)
{
return
snprintf(buf, buf_size, ": %s%s%s%s%s%s%s",
r1 ? r1 : "unknown",
r2 ? "; " : "", r2 ? r2 : "",
r3 ? "; " : "", r3 ? r3 : "",
r4 ? "; " : "", r4 ? r4 : "");
}

static void
custom_trace_payload_as_extra_end_stall(perfetto::protos::pbzero::GpuRenderStageEvent *event,
const struct trace_intel_end_stall *payload)
{
char buf[256];
size_t buf_size = 0;

{
auto data = event->add_extra_data();
data->set_name("stall_reason");

snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s : %s%s%s%s%s%s%s",
(payload->flags & INTEL_DS_DEPTH_CACHE_FLUSH_BIT) ? "+depth_flush" : "",
(payload->flags & INTEL_DS_DATA_CACHE_FLUSH_BIT) ? "+dc_flush" : "",
(payload->flags & INTEL_DS_HDC_PIPELINE_FLUSH_BIT) ? "+hdc_flush" : "",
(payload->flags & INTEL_DS_RENDER_TARGET_CACHE_FLUSH_BIT) ? "+rt_flush" : "",
(payload->flags & INTEL_DS_TILE_CACHE_FLUSH_BIT) ? "+tile_flush" : "",
(payload->flags & INTEL_DS_L3_FABRIC_FLUSH_BIT) ? "+l3_fabric_flush" : "",
(payload->flags & INTEL_DS_STATE_CACHE_INVALIDATE_BIT) ? "+state_inv" : "",
(payload->flags & INTEL_DS_CONST_CACHE_INVALIDATE_BIT) ? "+const_inv" : "",
(payload->flags & INTEL_DS_VF_CACHE_INVALIDATE_BIT) ? "+vf_inv" : "",
(payload->flags & INTEL_DS_TEXTURE_CACHE_INVALIDATE_BIT) ? "+tex_inv" : "",
(payload->flags & INTEL_DS_INST_CACHE_INVALIDATE_BIT) ? "+inst_inv" : "",
(payload->flags & INTEL_DS_STALL_AT_SCOREBOARD_BIT) ? "+pb_stall" : "",
(payload->flags & INTEL_DS_DEPTH_STALL_BIT) ? "+depth_stall" : "",
(payload->flags & INTEL_DS_CS_STALL_BIT) ? "+cs_stall" : "",
(payload->flags & INTEL_DS_UNTYPED_DATAPORT_CACHE_FLUSH_BIT) ? "+udp_flush" : "",
(payload->flags & INTEL_DS_END_OF_PIPE_BIT) ? "+eop" : "",
(payload->flags & INTEL_DS_CCS_CACHE_FLUSH_BIT) ? "+ccs_flush" : "",
(payload->reason1) ? payload->reason1 : "unknown",
(payload->reason2) ? "; " : "",
(payload->reason2) ? payload->reason2 : "",
(payload->reason3) ? "; " : "",
(payload->reason3) ? payload->reason3 : "",
(payload->reason4) ? "; " : "",
(payload->reason4) ? payload->reason4 : "");
data->set_name("reason");

buf_size += snprintf_flags(buf + buf_size, sizeof(buf) - buf_size,
(enum intel_ds_stall_flag) payload->flags);
buf_size += snprintf_reasons(buf + buf_size, sizeof(buf) - buf_size,
payload->reason1, payload->reason2,
payload->reason3, payload->reason4);
assert(strlen(buf) > 0);

data->set_value(buf);
}
}

static void
custom_trace_payload_as_extra_end_barrier(perfetto::protos::pbzero::GpuRenderStageEvent *event,
const struct trace_intel_end_barrier *payload)
{
char buf[256];
size_t buf_size = 0;

{
auto data = event->add_extra_data();
data->set_name("reason");

buf_size += snprintf_stages(buf + buf_size, sizeof(buf) - buf_size,
(enum intel_ds_barrier_type) payload->type,
(enum intel_ds_stages) payload->signal_stages,
(enum intel_ds_stages) payload->wait_stages);
buf_size += snprintf_flags(buf + buf_size, sizeof(buf) - buf_size,
(enum intel_ds_stall_flag) payload->flags);
buf_size += snprintf_reasons(buf + buf_size, sizeof(buf) - buf_size,
payload->reason1, payload->reason2,
payload->reason3, payload->reason4);
assert(strlen(buf) > 0);

data->set_value(buf);
@@ -514,6 +584,34 @@ intel_ds_end_stall(struct intel_ds_device *device,
(trace_payload_as_extra_func)custom_trace_payload_as_extra_end_stall);
}

void
intel_ds_begin_barrier(struct intel_ds_device *device,
uint64_t ts_ns,
uint16_t tp_idx,
const void *flush_data,
const struct trace_intel_begin_barrier *payload,
const void *indirect_data)
{
const struct intel_ds_flush_data *flush =
(const struct intel_ds_flush_data *) flush_data;
begin_event(flush->queue, ts_ns, INTEL_DS_QUEUE_STAGE_STALL);
}

void
intel_ds_end_barrier(struct intel_ds_device *device,
uint64_t ts_ns,
uint16_t tp_idx,
const void *flush_data,
const struct trace_intel_end_barrier *payload,
const void *indirect_data)
{
const struct intel_ds_flush_data *flush =
(const struct intel_ds_flush_data *) flush_data;
end_event(flush->queue, ts_ns, INTEL_DS_QUEUE_STAGE_STALL,
flush->submission_id, tp_idx, NULL, payload, indirect_data,
(trace_payload_as_extra_func)custom_trace_payload_as_extra_end_barrier);
}

uint64_t
intel_ds_begin_submit(struct intel_ds_queue *queue)
{


src/intel/ds/intel_driver_ds.h (+19, -0)

@@ -65,6 +65,22 @@ enum intel_ds_stall_flag {
INTEL_DS_L3_FABRIC_FLUSH_BIT = BITFIELD_BIT(17),
};

enum intel_ds_barrier_type {
INTEL_DS_BARRIER_TYPE_IMMEDIATE,
INTEL_DS_BARRIER_TYPE_SIGNAL,
INTEL_DS_BARRIER_TYPE_WAIT,
};

enum intel_ds_stages {
INTEL_DS_STAGES_TOP_BIT = BITFIELD_BIT(0),
INTEL_DS_STAGES_GEOM_BIT = BITFIELD_BIT(1),
INTEL_DS_STAGES_RASTER_BIT = BITFIELD_BIT(2),
INTEL_DS_STAGES_DEPTH_BIT = BITFIELD_BIT(3),
INTEL_DS_STAGES_PIXEL_BIT = BITFIELD_BIT(4),
INTEL_DS_STAGES_COLOR_BIT = BITFIELD_BIT(5),
INTEL_DS_STAGES_GPGPU_BIT = BITFIELD_BIT(6),
};

enum intel_ds_tracepoint_flags {
/**
* Whether the tracepoint's timestamp must be recorded with as an
@@ -85,6 +101,9 @@ enum intel_ds_tracepoint_flags {
/* Convert internal driver PIPE_CONTROL stall bits to intel_ds_stall_flag. */
typedef enum intel_ds_stall_flag (*intel_ds_stall_cb_t)(uint32_t flags);

/* Convert internal driver RESOUCE_BARRIER stages bits to intel_ds_stage. */
typedef enum intel_ds_stages (*intel_ds_stages_cb_t)(uint8_t stages);

enum intel_ds_queue_stage {
INTEL_DS_QUEUE_STAGE_QUEUE,
INTEL_DS_QUEUE_STAGE_FRAME,


src/intel/ds/intel_tracepoints.py (+69, -2)

@@ -272,10 +272,46 @@ def define_tracepoints(args):
bits.append(Arg(type='bool', name=a[1], var='__entry->flags & INTEL_DS_{0}_BIT'.format(a[0]), c_format='%u'))
return bits

def stall_args(args):
def stall_args(stall_bits):
fmt = ''
exprs = []
for a in args:
for a in stall_bits:
fmt += '%s'
exprs.append('(__entry->flags & INTEL_DS_{0}_BIT) ? "+{1}" : ""'.format(a[0], a[1]))
fmt += ' : %s%s%s%s%s%s%s'
exprs.append('(__entry->reason1) ? __entry->reason1 : "unknown"')
exprs.append('(__entry->reason2) ? "; " : ""')
exprs.append('(__entry->reason2) ? __entry->reason2 : ""')
exprs.append('(__entry->reason3) ? "; " : ""')
exprs.append('(__entry->reason3) ? __entry->reason3 : ""')
exprs.append('(__entry->reason4) ? "; " : ""')
exprs.append('(__entry->reason4) ? __entry->reason4 : ""')
# To printout flags
# fmt += '(0x%08x)'
# exprs.append('__entry->flags')
fmt = [fmt]
fmt += exprs
return fmt

def barrier_args(stage_bits, stall_bits):
fmt = ''
exprs = []

fmt += '%s '
exprs.append('(__entry->type == INTEL_DS_BARRIER_TYPE_IMMEDIATE) ? "IMMEDIATE" : ' +
'(__entry->type == INTEL_DS_BARRIER_TYPE_SIGNAL) ? "SIGNAL" : ' +
'(__entry->type == INTEL_DS_BARRIER_TYPE_WAIT) ? "WAIT" : "unknown"')

for a in stages_bits:
fmt += '%s'
exprs.append('(__entry->signal_stages & INTEL_DS_STAGES_{0}_BIT) ? "+{1}" : ""'.format(a[0], a[1]))
fmt += '->'
for a in stages_bits:
fmt += '%s'
exprs.append('(__entry->wait_stages & INTEL_DS_STAGES_{0}_BIT) ? "+{1}" : ""'.format(a[0], a[1]))
fmt += ': '

for a in stall_bits:
fmt += '%s'
exprs.append('(__entry->flags & INTEL_DS_{0}_BIT) ? "+{1}" : ""'.format(a[0], a[1]))
fmt += ' : %s%s%s%s%s%s%s'
@@ -311,6 +347,37 @@ def define_tracepoints(args):
['END_OF_PIPE', 'eop'],
['CCS_CACHE_FLUSH', 'ccs_flush']]

stages_bits = [['TOP', 'top'],
['GEOM', 'geom'],
['RASTER', 'raster'],
['DEPTH', 'depth'],
['PIXEL', 'pixel'],
['COLOR', 'color'],
['GPGPU', 'gpgpu']]

begin_end_tp('barrier',
tp_args=[ArgStruct(type='uint8_t', var='type'),
ArgStruct(type='uint32_t', var='signal_stages'),
ArgStruct(type='uint32_t', var='wait_stages'),
ArgStruct(type='intel_ds_stages_cb_t', var='decode_stage_cb'),
ArgStruct(type='uint32_t', var='flags'),
ArgStruct(type='intel_ds_stall_cb_t', var='decode_flags_cb'),
ArgStruct(type='const char *', var='reason1'),
ArgStruct(type='const char *', var='reason2'),
ArgStruct(type='const char *', var='reason3'),
ArgStruct(type='const char *', var='reason4'),],
tp_struct=[Arg(type='uint8_t', name='type', var='type', c_format='0x%hhx'),
Arg(type='uint8_t', name='signal_stages', var='decode_stage_cb(signal_stages)', c_format='0x%hhx'),
Arg(type='uint8_t', name='wait_stages', var='decode_stage_cb(wait_stages)', c_format='0x%hhx'),
Arg(type='uint32_t', name='flags', var='decode_flags_cb(flags)', c_format='0x%x'),
Arg(type='const char *', name='reason1', var='reason1', c_format='%s'),
Arg(type='const char *', name='reason2', var='reason2', c_format='%s'),
Arg(type='const char *', name='reason3', var='reason3', c_format='%s'),
Arg(type='const char *', name='reason4', var='reason4', c_format='%s'),],
tp_print=barrier_args(stages_bits, stall_flags),
tp_default_enabled=False,
end_pipelined=False)

begin_end_tp('stall',
tp_args=[ArgStruct(type='uint32_t', var='flags'),
ArgStruct(type='intel_ds_stall_cb_t', var='decode_cb'),


src/intel/genxml/gen200.xml (+1, -1)

@@ -1595,7 +1595,7 @@
<field name="State Cache Invalidation Enable" dword="1" bits="2:2" type="bool" />
<field name="Constant Cache Invalidation Enable" dword="1" bits="3:3" type="bool" />
<field name="VF Cache Invalidation Enable" dword="1" bits="4:4" type="bool" />
<field name="DC Flush Enable" dword="1" bits="5:5" type="bool" />
<field name="Force Device Coherency" dword="1" bits="5:5" type="bool" />
<field name="Pipe Control Flush Enable" dword="1" bits="7:7" type="bool" />
<field name="Notify Enable" dword="1" bits="8:8" type="bool" />
<field name="Indirect State Pointers Disable" dword="1" bits="9:9" type="bool" />


src/intel/vulkan/anv_blorp.c (+24, -2)

@@ -667,7 +667,12 @@ void anv_CmdCopyImage2(
anv_cmd_buffer_is_compute_queue(cmd_buffer) ?
ANV_PIPE_HDC_PIPELINE_FLUSH_BIT :
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
anv_add_pending_pipe_bits(cmd_buffer, pipe_bits,
anv_add_pending_pipe_bits(cmd_buffer,
(batch.flags & BLORP_BATCH_USE_COMPUTE) ?
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT :
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
pipe_bits,
"Copy flush before astc emu");

for (unsigned r = 0; r < pCopyImageInfo->regionCount; r++) {
@@ -819,7 +824,12 @@ void anv_CmdCopyBufferToImage2(
anv_cmd_buffer_is_compute_queue(cmd_buffer) ?
ANV_PIPE_HDC_PIPELINE_FLUSH_BIT :
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
anv_add_pending_pipe_bits(cmd_buffer, pipe_bits,
anv_add_pending_pipe_bits(cmd_buffer,
(batch.flags & BLORP_BATCH_USE_COMPUTE) ?
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT :
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
pipe_bits,
"Copy flush before astc emu");

for (unsigned r = 0; r < pCopyBufferToImageInfo->regionCount; r++) {
@@ -1177,6 +1187,8 @@ anv_cmd_buffer_update_addr(
* texture cache so we don't get anything stale.
*/
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_HOST_BIT,
VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT,
"before UpdateBuffer");

@@ -1886,6 +1898,8 @@ anv_fast_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer,
* hangs when doing a clear with WM_HZ_OP.
*/
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
ANV_PIPE_DEPTH_STALL_BIT,
"before clear hiz");
@@ -1913,6 +1927,8 @@ anv_fast_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer,
unsigned wa_flush = cmd_buffer->device->info->verx10 >= 125 ?
ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
ANV_PIPE_CS_STALL_BIT |
ANV_PIPE_TILE_CACHE_FLUSH_BIT |
@@ -1955,6 +1971,8 @@ anv_fast_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer,
*/
if (cmd_buffer->device->info->verx10 < 120) {
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
ANV_PIPE_DEPTH_STALL_BIT,
"after clear hiz");
@@ -2565,6 +2583,8 @@ anv_image_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer,
* cache before rendering to it.
*/
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
ANV_PIPE_END_OF_PIPE_SYNC_BIT,
"before clear DS");
@@ -2584,6 +2604,8 @@ anv_image_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer,
* cache before someone starts trying to do stencil on it.
*/
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
ANV_PIPE_END_OF_PIPE_SYNC_BIT,
"after clear DS");


src/intel/vulkan/anv_cmd_buffer.c (+2, -0)

@@ -1796,6 +1796,8 @@ anv_begin_companion_cmd_buffer_helper(struct anv_cmd_buffer **cmd_buffer,
*/
if (prev_cmd_buffer->device->info->has_aux_map) {
anv_add_pending_pipe_bits(prev_cmd_buffer->companion_rcs_cmd_buffer,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
"new cmd buffer with aux-tt");
}


src/intel/vulkan/anv_event.c (+5, -4)

@@ -20,9 +20,10 @@ VkResult anv_CreateEvent(
if (event == NULL)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

event->flags = pCreateInfo->flags;
event->state = anv_state_pool_alloc(&device->dynamic_state_pool,
sizeof(uint64_t), 8);
*(uint64_t *)event->state.map = VK_EVENT_RESET;
*(uint64_t *)event->state.map = 0;

ANV_RMV(event_create, device, event, pCreateInfo->flags, false);

@@ -59,7 +60,7 @@ VkResult anv_GetEventStatus(
if (vk_device_is_lost(&device->vk))
return VK_ERROR_DEVICE_LOST;

return *(uint64_t *)event->state.map;
return *(uint64_t *)event->state.map ? VK_EVENT_SET : VK_EVENT_RESET;
}

VkResult anv_SetEvent(
@@ -68,7 +69,7 @@ VkResult anv_SetEvent(
{
ANV_FROM_HANDLE(anv_event, event, _event);

*(uint64_t *)event->state.map = VK_EVENT_SET;
*(uint64_t *)event->state.map = 1;

return VK_SUCCESS;
}
@@ -79,7 +80,7 @@ VkResult anv_ResetEvent(
{
ANV_FROM_HANDLE(anv_event, event, _event);

*(uint64_t *)event->state.map = VK_EVENT_RESET;
*(uint64_t *)event->state.map = 0;

return VK_SUCCESS;
}

src/intel/vulkan/anv_genX.h (+18, -29)

@@ -108,12 +108,6 @@ void genX(batch_emit_vertex_input)(struct anv_batch *batch,
struct anv_shader *shader,
const struct vk_vertex_input_state *vi);

enum anv_pipe_bits
genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
struct anv_device *device,
uint32_t current_pipeline,
enum anv_pipe_bits bits,
enum anv_pipe_bits *emitted_flush_bits);
void
genX(invalidate_aux_map)(struct anv_batch *batch,
struct anv_device *device,
@@ -174,24 +168,6 @@ genX(cmd_buffer_set_coarse_pixel_active)(struct anv_cmd_buffer *cmd_buffer,
#endif
}

static inline void
genX(cmd_buffer_post_dispatch_wa)(struct anv_cmd_buffer *cmd_buffer)
{
/* TODO: Add INTEL_NEEDS_WA_14025112257 check once HSD is propogated for all
* other impacted platforms.
*/
if (cmd_buffer->device->info->ver >= 20 &&
anv_cmd_buffer_is_compute_queue(cmd_buffer)) {
enum anv_pipe_bits emitted_bits = 0;
genX(emit_apply_pipe_flushes)(&cmd_buffer->batch,
cmd_buffer->device,
cmd_buffer->state.current_pipeline,
ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
&emitted_bits);
cmd_buffer->state.pending_pipe_bits &= ~emitted_bits;
}
}

void
genX(setup_autostrip_state)(struct anv_cmd_buffer *cmd_buffer, bool enable);

@@ -200,7 +176,8 @@ void genX(emit_so_memcpy_init)(struct anv_memcpy_state *state,
struct anv_cmd_buffer *cmd_buffer,
struct anv_batch *batch);

void genX(emit_so_memcpy_fini)(struct anv_memcpy_state *state);
void genX(emit_so_memcpy_fini)(struct anv_memcpy_state *state,
bool wait_completion);

void genX(emit_so_memcpy_end)(struct anv_memcpy_state *state);

@@ -262,10 +239,6 @@ genX(emit_urb_setup)(struct anv_batch *batch,
void genX(emit_sample_pattern)(struct anv_batch *batch,
const struct vk_sample_locations_state *sl);

void genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
struct anv_address dst, struct anv_address src,
uint32_t size);

void genX(blorp_init_dynamic_states)(struct blorp_context *context);

void genX(blorp_exec)(struct blorp_batch *batch,
@@ -542,3 +515,19 @@ void genX(write_rt_shader_group)(struct anv_device *device,

uint32_t genX(shader_cmd_size)(struct anv_device *device,
mesa_shader_stage stage);

static inline void
genX(cmd_buffer_post_dispatch_wa)(struct anv_cmd_buffer *cmd_buffer)
{
/* TODO: Add INTEL_NEEDS_WA_14025112257 check once HSD is propogated for all
* other impacted platforms.
*/
if (cmd_buffer->device->info->ver >= 20 &&
anv_cmd_buffer_is_compute_queue(cmd_buffer)) {
genX(batch_emit_pipe_control)(&cmd_buffer->batch,
cmd_buffer->device->info,
cmd_buffer->state.current_pipeline,
ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
"Wa_14025112257");
}
}

src/intel/vulkan/anv_private.h (+47, -10)

@@ -4020,6 +4020,25 @@ enum anv_pipe_bits {
*/
ANV_PIPE_POST_SYNC_BIT = (1 << 24),

/* This bit does not exist directly in PIPE_CONTROL. It indicates that the
* end-of-pipe write needs to be flushed out of L3. On Xe2+ this means that
* we cannot use RESOURCE_BARRIER to write that value since it'll stay in
* L3.
*/
ANV_PIPE_END_OF_PIPE_SYNC_FORCE_FLUSH_L3_BIT = (1 << 25),

/* This bit does not exist directly in PIPE_CONTROL. It helps to track post
* fast clear flushes. BSpec 57340 says in relation to fast clear flushes
* that "RESOURCE_BARRIER allows hardware to opportunistically combine this
* operation with previous RESOURCE_BARRIER commands potentially reducing
* overall synchronization cost", that appears to be untrue as experienced
* with
* dEQP-VK.synchronization.op.single_queue.barrier.write_clear_color_image_read_copy_image_to_buffer.image_128x128_r8_unorm
*
* If a PIPE_CONTROL is emitted this should be converted to
* ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT.
*/
ANV_PIPE_RT_BTI_CHANGE = (1 << 26),
};

/* These bits track the state of buffer writes for queries. They get cleared
@@ -4035,20 +4054,21 @@ enum anv_query_bits {
ANV_QUERY_WRITES_DATA_FLUSH = (1 << 3),
};

/* It's not clear why DG2 doesn't have issues with L3/CS coherency. But it's
* likely related to performance workaround 14015868140.
/* It's not clear why DG2/Xe2+ doesn't have issues with L3/CS coherency. But
* it's likely related to performance workaround 14015868140.
*
* For now we enable this only on DG2 and platform prior to Gfx12 where there
* is no tile cache.
* For now we enable this only on DG2/Xe2+ and platform prior to Gfx12 where
* there is no tile cache.
*/
#define ANV_DEVINFO_HAS_COHERENT_L3_CS(devinfo) \
(intel_device_info_is_dg2(devinfo))
(intel_device_info_is_dg2(devinfo) || (devinfo)->ver >= 20)

/* Things we need to flush before accessing query data using the command
* streamer.
*
* Prior to DG2 experiments show that the command streamer is not coherent
* with the tile cache so we need to flush it to make any data visible to CS.
* Prior to DG2/Xe2+ experiments show that the command streamer is not
* coherent with the tile cache so we need to flush it to make any data
* visible to CS.
*
* Otherwise we want to flush the RT cache which is where blorp writes, either
* for clearing the query buffer or for clearing the destination buffer in
@@ -4093,6 +4113,12 @@ enum anv_query_bits {
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | \
ANV_PIPE_TILE_CACHE_FLUSH_BIT)

#define ANV_PIPE_L1_L2_BARRIER_FLUSH_BITS ( \
ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | \
ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | \
ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT | \
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)

#define ANV_PIPE_STALL_BITS ( \
ANV_PIPE_STALL_AT_SCOREBOARD_BIT | \
ANV_PIPE_DEPTH_STALL_BIT | \
@@ -4633,6 +4659,8 @@ struct anv_cmd_state {
struct anv_cmd_compute_state compute;
struct anv_cmd_ray_tracing_state rt;

VkPipelineStageFlags2 pending_src_stages;
VkPipelineStageFlags2 pending_dst_stages;
enum anv_pipe_bits pending_pipe_bits;

/**
@@ -5157,7 +5185,7 @@ void anv_cmd_buffer_restore_state(struct anv_cmd_buffer *cmd_buffer,

struct anv_event {
struct vk_object_base base;
uint64_t semaphore;
VkEventCreateFlags flags;
struct anv_state state;
};

@@ -6766,21 +6794,30 @@ anv_dump_pipe_bits(enum anv_pipe_bits bits, struct log_stream *stream);

void
anv_cmd_buffer_pending_pipe_debug(struct anv_cmd_buffer *cmd_buffer,
VkPipelineStageFlags2 src_stages,
VkPipelineStageFlags2 dst_stages,
enum anv_pipe_bits bits,
const char* reason);

static inline void
anv_add_pending_pipe_bits(struct anv_cmd_buffer* cmd_buffer,
VkPipelineStageFlags2 src_stages,
VkPipelineStageFlags2 dst_stages,
enum anv_pipe_bits bits,
const char* reason)
{
cmd_buffer->state.pending_src_stages |= src_stages;
cmd_buffer->state.pending_dst_stages |= dst_stages;
cmd_buffer->state.pending_pipe_bits |= bits;
if (unlikely(u_trace_enabled(&cmd_buffer->device->ds.trace_context))) {
if (cmd_buffer->batch.pc_reasons_count < ARRAY_SIZE(cmd_buffer->batch.pc_reasons))
cmd_buffer->batch.pc_reasons[cmd_buffer->batch.pc_reasons_count++] = reason;
}
if (INTEL_DEBUG(DEBUG_PIPE_CONTROL))
anv_cmd_buffer_pending_pipe_debug(cmd_buffer, bits, reason);
if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) {
anv_cmd_buffer_pending_pipe_debug(cmd_buffer,
src_stages, dst_stages, bits,
reason);
}
}

struct anv_performance_configuration_intel {
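
Taken together with the call sites updated elsewhere in this series, anv_add_pending_pipe_bits() now takes the producing and consuming pipeline stages in addition to the pipe bits, which appear to map to the source and destination stage masks of a Vulkan barrier. A minimal sketch of the new call shape (the stage/flag combination mirrors the genX_acceleration_structure.c call sites later in this diff; the reason string is illustrative):

/* src_stages: stages whose writes must become visible;
 * dst_stages: stages that will consume the data.
 */
anv_add_pending_pipe_bits(cmd_buffer,
                          VK_PIPELINE_STAGE_2_TRANSFER_BIT,     /* src_stages */
                          VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,  /* dst_stages */
                          ANV_PIPE_DATA_CACHE_FLUSH_BIT,
                          "example: flush transfer writes");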


src/intel/vulkan/anv_util.c (+20, -3)

@@ -75,19 +75,34 @@ __anv_perf_warn(struct anv_device *device,

void
anv_cmd_buffer_pending_pipe_debug(struct anv_cmd_buffer *cmd_buffer,
VkPipelineStageFlags2 src_stages,
VkPipelineStageFlags2 dst_stages,
enum anv_pipe_bits bits,
const char* reason)
{
if (bits == 0)
if (bits == 0 && src_stages == 0 && dst_stages == 0)
return;

struct log_stream *stream = mesa_log_streami();

mesa_log_stream_printf(stream, "acc: ");

mesa_log_stream_printf(stream, "bits: ");
mesa_log_stream_printf(stream, "src: ");
u_foreach_bit64(b, src_stages) {
mesa_log_stream_printf(stream, "%s,",
vk_PipelineStageFlagBits2_to_str(BITFIELD_BIT(b)) +
strlen("VK_PIPELINE_STAGE_2_"));
}
mesa_log_stream_printf(stream, " dst: ");
u_foreach_bit64(b, dst_stages) {
mesa_log_stream_printf(stream, "%s,",
vk_PipelineStageFlagBits2_to_str(BITFIELD_BIT(b)) +
strlen("VK_PIPELINE_STAGE_2_"));
}

mesa_log_stream_printf(stream, " bits: ");
anv_dump_pipe_bits(bits, stream);
mesa_log_stream_printf(stream, "reason: %s", reason);
mesa_log_stream_printf(stream, " reason: %s", reason);

mesa_log_stream_printf(stream, "\n");

@@ -132,6 +147,8 @@ anv_dump_pipe_bits(enum anv_pipe_bits bits, struct log_stream *stream)
mesa_log_stream_printf(stream, "+utdp_flush ");
if (bits & ANV_PIPE_CCS_CACHE_FLUSH_BIT)
mesa_log_stream_printf(stream, "+ccs_flush ");
if (bits & ANV_PIPE_RT_BTI_CHANGE)
mesa_log_stream_printf(stream, "+rt-bti-change ");
}

const char *


src/intel/vulkan/anv_utrace.c (+0, -1)

@@ -219,7 +219,6 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
anv_device_utrace_emit_gfx_copy_buffer);
}
}
anv_genX(device->info, emit_so_memcpy_fini)(&submit->memcpy_state);

trace_intel_end_trace_copy_cb(&submit->ds.trace, batch, num_traces);



src/intel/vulkan/genX_acceleration_structure.c (+10, -2)

@@ -437,7 +437,10 @@ anv_init_header(VkCommandBuffer commandBuffer, const struct vk_acceleration_stru
* dispatch size paramters) is not L3 coherent.
*/
if (!ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info)) {
anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_DATA_CACHE_FLUSH_BIT,
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR,
ANV_PIPE_DATA_CACHE_FLUSH_BIT,
"copy dispatch size for dispatch");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}
@@ -670,7 +673,10 @@ genX(CmdCopyAccelerationStructureKHR)(
* dispatch paramters) is not L3 coherent.
*/
if (!ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info)) {
anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_DATA_CACHE_FLUSH_BIT,
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_TRANSFER_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_DATA_CACHE_FLUSH_BIT,
"bvh size read for dispatch");
}

@@ -720,6 +726,8 @@ genX(CmdCopyAccelerationStructureToMemoryKHR)(
*/
if (!ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info)) {
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_TRANSFER_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_DATA_CACHE_FLUSH_BIT,
"bvh size read for dispatch");
}


src/intel/vulkan/genX_blorp_exec.c (+11, -10)

@@ -313,8 +313,9 @@ blorp_exec_on_render(struct blorp_batch *batch,
*/
if (blorp_uses_bti_rt_writes(batch, params)) {
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
ANV_PIPE_RT_BTI_CHANGE,
"before blorp BTI change");
}
#endif
@@ -330,12 +331,11 @@ blorp_exec_on_render(struct blorp_batch *batch,
hw_state->ds_write_state = blorp_ds_state;
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_WA_18019816803);

/* Add the stall that will flush prior to the blorp operation by
* genX(cmd_buffer_apply_pipe_flushes)
*/
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_PSS_STALL_SYNC_BIT,
"Wa_18019816803");
genX(batch_emit_pipe_control)(&cmd_buffer->batch,
cmd_buffer->device->info,
cmd_buffer->state.current_pipeline,
ANV_PIPE_PSS_STALL_SYNC_BIT,
"Wa_18019816803");
}
}
#endif
@@ -381,8 +381,9 @@ blorp_exec_on_render(struct blorp_batch *batch,
*/
if (blorp_uses_bti_rt_writes(batch, params)) {
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
ANV_PIPE_RT_BTI_CHANGE,
"after blorp BTI change");
}
#endif


src/intel/vulkan/genX_cmd_buffer.c (+819, -152)

@@ -55,7 +55,12 @@ static enum anv_pipe_bits
convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
enum anv_pipe_bits bits = 0;
bits |= (pc->DepthCacheFlushEnable) ? ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0;
#if GFX_VER >= 20
bits |= (pc->ForceDeviceCoherency) ? (ANV_PIPE_DATA_CACHE_FLUSH_BIT |
ANV_PIPE_TILE_CACHE_FLUSH_BIT) : 0;
#else
bits |= (pc->DCFlushEnable) ? ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
#endif
#if GFX_VERx10 >= 125
bits |= (pc->PSSStallSyncEnable) ? ANV_PIPE_PSS_STALL_SYNC_BIT : 0;
#endif
@@ -587,6 +592,8 @@ transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_HIZ_CCS &&
final_needs_depth && !initial_depth_valid) {
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_TILE_CACHE_FLUSH_BIT,
"HIZ-CCS flush");
}
@@ -658,6 +665,8 @@ transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
*/
if (intel_device_info_is_mtl(cmd_buffer->device->info)) {
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_TILE_CACHE_FLUSH_BIT,
"HIZ-CCS flush");
}
@@ -936,6 +945,8 @@ genX(cmd_buffer_load_clear_color)(struct anv_cmd_buffer *cmd_buffer,
* In testing, SKL doesn't actually seem to need this, but HSW does.
*/
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
"after load_clear_color surface state update");
#endif
@@ -1632,14 +1643,564 @@ genX(invalidate_aux_map)(struct anv_batch *batch,
#endif
}

ALWAYS_INLINE enum anv_pipe_bits
#if GFX_VER >= 20
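/* Pretty-print a RESOURCE_BARRIER stage bitfield for the debug log. */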
ALWAYS_INLINE static void
anv_dump_rsc_stage(struct log_stream *stream,
const enum GENX(RESOURCE_BARRIER_STAGE) stage)
{
u_foreach_bit(bit, stage) {
switch(1 << bit) {
case RESOURCE_BARRIER_STAGE_NONE:
mesa_log_stream_printf(stream, "None,");
break;
case RESOURCE_BARRIER_STAGE_TOP:
mesa_log_stream_printf(stream, "Top,");
break;
case RESOURCE_BARRIER_STAGE_COLOR:
mesa_log_stream_printf(stream, "Color,");
break;
case RESOURCE_BARRIER_STAGE_GPGPU:
mesa_log_stream_printf(stream, "GPGPU,");
break;
case RESOURCE_BARRIER_STAGE_GEOM:
mesa_log_stream_printf(stream, "Geometry,");
break;
case RESOURCE_BARRIER_STAGE_RASTER:
mesa_log_stream_printf(stream, "Raster,");
break;
case RESOURCE_BARRIER_STAGE_DEPTH:
mesa_log_stream_printf(stream, "Depth,");
break;
case RESOURCE_BARRIER_STAGE_PIXEL:
mesa_log_stream_printf(stream, "Pixel,");
break;
case RESOURCE_BARRIER_STAGE_COLORANDCOMPUTE:
mesa_log_stream_printf(stream, "ColorAndCompute,");
break;
case RESOURCE_BARRIER_STAGE_GEOMETRYANDCOMPUTE:
mesa_log_stream_printf(stream, "GeometryAndCompute,");
break;
default:
UNREACHABLE("Unknown barrier stage");
}
}
}

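/* Dump the contents of a RESOURCE_BARRIER body to the debug log when the
* pipe-control debug flag is enabled.
*/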
ALWAYS_INLINE static void
anv_dump_rsc_barrier_body(const struct GENX(RESOURCE_BARRIER_BODY) body) {
if (!INTEL_DEBUG(DEBUG_PIPE_CONTROL))
return;

struct log_stream *stream = mesa_log_streami();

mesa_log_stream_printf(
stream, "rb : Type=%s ",
body.BarrierType == RESOURCE_BARRIER_TYPE_IMMEDIATE ? "Immediate" :
body.BarrierType == RESOURCE_BARRIER_TYPE_SIGNAL ? "Signal" :
body.BarrierType == RESOURCE_BARRIER_TYPE_WAIT ? "Wait" : "Unknown");

if (body.SignalStage) {
mesa_log_stream_printf(stream, " Signal=");
anv_dump_rsc_stage(stream, body.SignalStage);
}

if (body.WaitStage) {
mesa_log_stream_printf(stream, " Wait=");
anv_dump_rsc_stage(stream, body.WaitStage);
}

mesa_log_stream_printf(stream, " Flags=");
if (body.L1DataportCacheInvalidate)
mesa_log_stream_printf(stream, "+l1-dc-inval,");
if (body.DepthCache)
mesa_log_stream_printf(stream, "+ds-flush,");
if (body.ColorCache)
mesa_log_stream_printf(stream, "+rt-flush,");
if (body.L1DataportUAVFlush)
mesa_log_stream_printf(stream, "+l1-dp-uav-flush,");
if (body.TextureRO)
mesa_log_stream_printf(stream, "+tex-inval,");
if (body.StateRO)
mesa_log_stream_printf(stream, "+state-inval,");
if (body.VFRO)
mesa_log_stream_printf(stream, "+vf-inval,");
if (body.AMFS)
mesa_log_stream_printf(stream, "+amfs,");
if (body.ConstantCache)
mesa_log_stream_printf(stream, "+const-inval,");

mesa_log_stream_printf(stream, "\n");

mesa_log_stream_destroy(stream);
}

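/* Translate RESOURCE_BARRIER stage bits into the intel_ds stage bits used
* by the u_trace/perfetto instrumentation.
*/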
static inline enum intel_ds_stages
resource_barrier_stage_to_ds(uint8_t stages)
{
enum intel_ds_stages ret = 0;

u_foreach_bit(b, stages) {
switch (BITFIELD_BIT(b)) {
case RESOURCE_BARRIER_STAGE_TOP:
ret |= INTEL_DS_STAGES_TOP_BIT;
break;
case RESOURCE_BARRIER_STAGE_GEOM:
ret |= INTEL_DS_STAGES_GEOM_BIT;
break;
case RESOURCE_BARRIER_STAGE_RASTER:
ret |= INTEL_DS_STAGES_RASTER_BIT;
break;
case RESOURCE_BARRIER_STAGE_DEPTH:
ret |= INTEL_DS_STAGES_DEPTH_BIT;
break;
case RESOURCE_BARRIER_STAGE_PIXEL:
ret |= INTEL_DS_STAGES_PIXEL_BIT;
break;
case RESOURCE_BARRIER_STAGE_COLOR:
ret |= INTEL_DS_STAGES_COLOR_BIT;
break;
case RESOURCE_BARRIER_STAGE_GPGPU:
ret |= INTEL_DS_STAGES_GPGPU_BIT;
break;
default:
UNREACHABLE("invalid barrier stage");
}
}
return ret;
}

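/* Map Vulkan source stages to the RESOURCE_BARRIER signal stage(s) the HW
* can drain to (see the BSpec notes below).
*/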
ALWAYS_INLINE static enum GENX(RESOURCE_BARRIER_STAGE)
resource_barrier_signal_stage(enum intel_engine_class engine_class,
const VkPipelineStageFlags2 vk_stages)
{
enum GENX(RESOURCE_BARRIER_STAGE) hw_stages = 0;

/* BSpec 56054, Signal Stage:
* "Hardware only drains to the following stages:
* Geometry(Streamout Stage)
* Color(End of 3D pipeline)
* Compute(End of compute pipeline)"
*
* Setting other stages does not seem to cause issues, though.
*/
if (engine_class == INTEL_ENGINE_CLASS_RENDER) {
if (vk_stages & (VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT_KHR |
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT_KHR |
VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT_KHR |
VK_PIPELINE_STAGE_2_TRANSFER_BIT_KHR |
VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT_KHR |
VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT_KHR |
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT_KHR |
VK_PIPELINE_STAGE_2_COPY_BIT_KHR |
VK_PIPELINE_STAGE_2_RESOLVE_BIT_KHR |
VK_PIPELINE_STAGE_2_BLIT_BIT_KHR |
VK_PIPELINE_STAGE_2_CLEAR_BIT_KHR))
hw_stages |= RESOURCE_BARRIER_STAGE_COLOR;
else if (vk_stages & (VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT_KHR |
VK_PIPELINE_STAGE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR))
hw_stages |= RESOURCE_BARRIER_STAGE_PIXEL;
else if (vk_stages & (VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT_KHR))
hw_stages |= RESOURCE_BARRIER_STAGE_DEPTH;
else if (vk_stages & (VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT_KHR |
VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT_KHR |
VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT_KHR |
VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT_KHR |
VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT_KHR |
VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT_KHR |
VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT_KHR |
VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT_KHR |
VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT_KHR |
VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT |
VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT |
VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT))
hw_stages |= RESOURCE_BARRIER_STAGE_GEOM;
else if (vk_stages & (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR |
VK_PIPELINE_STAGE_2_HOST_BIT_KHR |
VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT))
hw_stages |= RESOURCE_BARRIER_STAGE_GEOM;
}
if (vk_stages & (VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT_KHR |
VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT_KHR |
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT_KHR |
VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR |
VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR |
VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
VK_PIPELINE_STAGE_2_MICROMAP_BUILD_BIT_EXT))
hw_stages |= RESOURCE_BARRIER_STAGE_GPGPU;

/* Transfer is either done using the color output or a compute shader on
* RCS and only a compute shader on CCS.
*/
if (vk_stages & (VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT_KHR |
VK_PIPELINE_STAGE_2_TRANSFER_BIT_KHR |
VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT_KHR |
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT_KHR |
VK_PIPELINE_STAGE_2_COPY_BIT_KHR |
VK_PIPELINE_STAGE_2_CLEAR_BIT_KHR)) {
if (engine_class == INTEL_ENGINE_CLASS_RENDER) {
hw_stages |= RESOURCE_BARRIER_STAGE_COLOR |
RESOURCE_BARRIER_STAGE_GPGPU;
} else {
hw_stages |= RESOURCE_BARRIER_STAGE_GPGPU;
}
}

/* BSpec 56054, Signal Stage:
* "If the driver receives a request for multiple stages in the 3D pipeline,
* it only specifies the last stage."
*/
if ((hw_stages & RESOURCE_BARRIER_STAGE_GEOM) &&
(hw_stages & RESOURCE_BARRIER_STAGE_COLOR))
hw_stages &= ~RESOURCE_BARRIER_STAGE_GEOM;

return hw_stages;
}

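/* Map Vulkan destination stages to the single RESOURCE_BARRIER stage the
* HW can stall at (see the BSpec notes below).
*/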
ALWAYS_INLINE static enum GENX(RESOURCE_BARRIER_STAGE)
resource_barrier_wait_stage(enum intel_engine_class engine_class,
const VkPipelineStageFlags2 vk_stages)
{
enum GENX(RESOURCE_BARRIER_STAGE) hw_stage = 0;

/* BSpec 56054, Wait Stage:
* "Hardware is only able to stall at the following stages:
* Top of the pipe (command parser)
* Raster (Before rasterization of objects)
* Pixel Shader (Before dispatch)
*
* Programming different stages will cause the hardware to select the
* stage before and nearest to the programmed stages"
*
* That is why we use TOP for compute stages.
*/
if (vk_stages & (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR |
VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT |
VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT_KHR |
VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT_KHR |
VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT_KHR |
VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT_KHR |
VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT_KHR |
VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT_KHR |
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT_KHR |
VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT_KHR |
VK_PIPELINE_STAGE_2_TRANSFER_BIT_KHR |
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT_KHR |
VK_PIPELINE_STAGE_2_HOST_BIT_KHR |
VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT_KHR |
VK_PIPELINE_STAGE_2_COPY_BIT_KHR |
VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT_KHR |
VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT_KHR |
VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT_KHR |
VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT |
VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR |
VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR |
VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT |
VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT |
VK_PIPELINE_STAGE_2_MICROMAP_BUILD_BIT_EXT))
hw_stage = RESOURCE_BARRIER_STAGE_TOP;
else if (vk_stages & (VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT_KHR |
VK_PIPELINE_STAGE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR))
hw_stage = RESOURCE_BARRIER_STAGE_RASTER;
else if (vk_stages & (VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT_KHR |
VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT_KHR |
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT_KHR |
VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT_KHR |
VK_PIPELINE_STAGE_2_RESOLVE_BIT_KHR |
VK_PIPELINE_STAGE_2_BLIT_BIT_KHR |
VK_PIPELINE_STAGE_2_CLEAR_BIT_KHR))
hw_stage = RESOURCE_BARRIER_STAGE_PIXEL;

return hw_stage;
}

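/* Check whether a barrier can be expressed with a RESOURCE_BARRIER
* instruction instead of a PIPE_CONTROL.
*/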
ALWAYS_INLINE static bool
can_use_resource_barrier(const struct intel_device_info *devinfo,
enum intel_engine_class engine_class,
VkPipelineStageFlags2 src_stages,
VkPipelineStageFlags2 dst_stages,
enum anv_pipe_bits bits,
struct anv_address signal_addr,
struct anv_address wait_addr)
{
if (INTEL_DEBUG(DEBUG_NO_RESOURCE_BARRIER))
return false;

if (engine_class != INTEL_ENGINE_CLASS_RENDER &&
engine_class != INTEL_ENGINE_CLASS_COMPUTE)
return false;

/* Wa_18039014283:
*
* RESOURCE_BARRIER instructions with a Type=Signal, SignalStage=GPGPU are
* not functional. Since the main use case for this is VkEvent and VkEvent
* might not have exactly matching informations on both signal/wait sides
* (see
* https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdWaitEvents.html),
* this is somewhat unusable.
*
* We're also seeing other problems with this, for example with
* dEQP-VK.synchronization2.op.single_queue.event.write_blit_image_read_copy_image_to_buffer.image_128_r32_uint
* So HW might be more broken than expected.
*/
if (intel_needs_workaround(devinfo, 18039014283) &&
(!anv_address_is_null(signal_addr) ||
!anv_address_is_null(wait_addr)))
return false;

/* The HW doesn't support signaling from the top of the pipeline */
enum GENX(RESOURCE_BARRIER_STAGE) signal_stage =
resource_barrier_signal_stage(engine_class, src_stages);
if (signal_stage == RESOURCE_BARRIER_STAGE_NONE ||
signal_stage == RESOURCE_BARRIER_STAGE_TOP)
return false;

/* L3 flushes are also not supported with RESOURCE_BARRIER */
if (bits & (ANV_PIPE_TILE_CACHE_FLUSH_BIT |
ANV_PIPE_DATA_CACHE_FLUSH_BIT |
ANV_PIPE_L3_FABRIC_FLUSH_BIT |
ANV_PIPE_CS_STALL_BIT))
return false;

return true;
}

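/* Emit a single RESOURCE_BARRIER covering the requested flushes,
* invalidations and signal/wait semantics.
*/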
ALWAYS_INLINE static void
emit_resource_barrier(struct anv_batch *batch,
const struct intel_device_info *devinfo,
VkPipelineStageFlags2 src_stages,
VkPipelineStageFlags2 dst_stages,
enum anv_pipe_bits bits,
struct anv_address signal_addr,
struct anv_address wait_addr)
{
trace_intel_begin_barrier(batch->trace);

enum GENX(RESOURCE_BARRIER_STAGE) signal_stages =
resource_barrier_signal_stage(batch->engine_class, src_stages);
enum GENX(RESOURCE_BARRIER_STAGE) wait_stages =
resource_barrier_wait_stage(batch->engine_class, dst_stages);

enum GENX(RESOURCE_BARRIER_TYPE) barrier_type;
struct anv_address barrier_addr;
if (anv_address_is_null(signal_addr) &&
anv_address_is_null(wait_addr)) {
barrier_type = RESOURCE_BARRIER_TYPE_IMMEDIATE;
barrier_addr = ANV_NULL_ADDRESS;
} else if (!anv_address_is_null(signal_addr)) {
barrier_type = RESOURCE_BARRIER_TYPE_SIGNAL;
barrier_addr = signal_addr;
} else {
assert(!anv_address_is_null(wait_addr));
barrier_type = RESOURCE_BARRIER_TYPE_WAIT;
barrier_addr = wait_addr;
}

#if INTEL_WA_18037648410_GFX_VER
/* "When setting VF invalidate as a flush bit in RESOURCE_BARRIER, ensure
* that Geometry Stage bit is set in Signal field."
*/
if (intel_needs_workaround(devinfo, 18037648410) &&
(bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT) &&
signal_stages == RESOURCE_BARRIER_STAGE_GPGPU) {
signal_stages |= RESOURCE_BARRIER_STAGE_GEOM;
}
#endif

if (bits & ANV_PIPE_RT_BTI_CHANGE) {
/* We used to deal with RT BTI changes with a PIPE_CONTROL with the
* following flags:
* - RenderTargetCacheFlushEnable
* - StallAtPixelScoreboard
*
* With the new RESOURCE_BARRIER instruction, there is a problem in HW
* if you do something like this:
* Draw BT0=surfaceA
* Type=Immediate Signal=Color Wait=Top Flags=Color
* Draw BT0=surfaceB
*
* The new BTI0 is somehow not updated in the state cache, so the second
* draw's color writes go either to the previous surface or maybe to
* /dev/null?
*
* The Windows drivers appear not to experience this because they set
* COMMON_SLICE_CHICKEN3:StateCachePerfFixDisabled=true.
*
* Unfortunately we cannot enable this because we still rely pretty
* heavily on the binding table, and toggling that bit is a big
* performance regression on multiple benchmarks (up to 25%).
*
* So when ANV_PIPE_RT_BTI_CHANGE is set, emit a RT flush + state cache
* invalidation (which seems to correctly invalidate the RCC).
*/
bits |= ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
}

anv_batch_emit(batch, GENX(RESOURCE_BARRIER), barrier) {
barrier.ResourceBarrierBody.BarrierType = barrier_type;
barrier.ResourceBarrierBody.BarrierIDAddress = barrier_addr;
barrier.ResourceBarrierBody.SignalStage = signal_stages;
barrier.ResourceBarrierBody.WaitStage = wait_stages;

/*
* ----------------------------------------------------------------------------------------------------------------
* | STATE_COMPUTE_MODE | RESOURCE_BARRIER | HW behavior |
* | UAV Coherency Mode | L1 DataPort UAV Flush | L1 DataPort Cache Invalidate | LSC Flush | LSC Inv | HDC Flush |
* ----------------------------------------------------------------------------------------------------------------
* | Drain Dataport Mode | 0 | 0 | 0 | 0 | 0 |
* | Drain Dataport Mode | 0 | 1 | 0 | 1 | 1 |
* | Drain Dataport Mode | 1 | 0 | 0 | 0 | 1 |
* | Drain Dataport Mode | 1 | 1 | 0 | 1 | 1 |
* | Flush Dataport L1 | 0 | 0 | 0 | 0 | 0 |
* | Flush Dataport L1 | 0 | 1 | 0 | 1 | 1 |
* | Flush Dataport L1 | 1 | 0 | 0 | 0 | 1 |
* | Flush Dataport L1 | 1 | 1 | 1 | 1 | 1 |
* ----------------------------------------------------------------------------------------------------------------
*/

/* Flushes */
barrier.ResourceBarrierBody.L1DataportCacheInvalidate =
bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
barrier.ResourceBarrierBody.L1DataportUAVFlush =
bits & (ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT);
barrier.ResourceBarrierBody.DepthCache =
bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
barrier.ResourceBarrierBody.ColorCache =
bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;

/* Invalidations */
barrier.ResourceBarrierBody.VFRO =
bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
barrier.ResourceBarrierBody.TextureRO =
bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
barrier.ResourceBarrierBody.StateRO =
bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
barrier.ResourceBarrierBody.ConstantCache =
bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;

anv_dump_rsc_barrier_body(barrier.ResourceBarrierBody);
}

trace_intel_end_barrier(batch->trace,
barrier_type, signal_stages, wait_stages,
resource_barrier_stage_to_ds,
bits, anv_pipe_flush_bit_to_ds_stall_flag,
batch->pc_reasons[0],
batch->pc_reasons[1],
batch->pc_reasons[2],
batch->pc_reasons[3]);
batch->pc_reasons[0] = NULL;
batch->pc_reasons[1] = NULL;
batch->pc_reasons[2] = NULL;
batch->pc_reasons[3] = NULL;
batch->pc_reasons_count = 0;
}

#endif /* GFX_VER >= 20 */



ALWAYS_INLINE static enum anv_pipe_bits
genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
struct anv_device *device,
uint32_t current_pipeline,
VkPipelineStageFlags2 src_stages,
VkPipelineStageFlags2 dst_stages,
enum anv_pipe_bits bits,
struct anv_address signal_addr,
struct anv_address wait_addr,
enum anv_pipe_bits *emitted_flush_bits)
{
#if GFX_VER >= 12
#if GFX_VER >= 20
if (can_use_resource_barrier(device->info, batch->engine_class,
src_stages, dst_stages, bits,
signal_addr, wait_addr)) {
emit_resource_barrier(batch, device->info,
src_stages, dst_stages, bits,
signal_addr, wait_addr);
*emitted_flush_bits = 0;
return 0;
}
#endif

/* What stages require a stall at the pixel scoreboard */
VkPipelineStageFlags2 pb_stall_stages =
VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT;
if (batch->engine_class == INTEL_ENGINE_CLASS_RENDER) {
/* On a render queue, the following stages can also use a pixel shader.
*/
pb_stall_stages |=
VK_PIPELINE_STAGE_2_TRANSFER_BIT |
VK_PIPELINE_STAGE_2_RESOLVE_BIT |
VK_PIPELINE_STAGE_2_BLIT_BIT |
VK_PIPELINE_STAGE_2_CLEAR_BIT;
}
VkPipelineStageFlags2 cs_stall_stages =
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT |
VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR |
VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR;
if (batch->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
/* On a compute queue, the following stages can also use a compute
* shader.
*/
cs_stall_stages |=
VK_PIPELINE_STAGE_2_TRANSFER_BIT |
VK_PIPELINE_STAGE_2_RESOLVE_BIT |
VK_PIPELINE_STAGE_2_BLIT_BIT |
VK_PIPELINE_STAGE_2_CLEAR_BIT;
} else if (batch->engine_class == INTEL_ENGINE_CLASS_RENDER &&
current_pipeline == GPGPU) {
/* In GPGPU mode, the render queue can also use a compute shader for
* transfer operations.
*/
cs_stall_stages |= VK_PIPELINE_STAGE_2_TRANSFER_BIT;
}

/* Prior to Gfx20, we can restrict pb-stall/cs-stall to some pipeline
* modes. Gfx20 doesn't do pipeline switches, so we have to assume the
* worst case.
*/
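/* To use a PB-stall, the destination stages must all be contained within
* the fragment shader stages, so the HW can hold fragment shader dispatch
* until the synchronization has happened.
*/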
const bool needs_pb_stall =
batch->engine_class == INTEL_ENGINE_CLASS_RENDER &&
#if GFX_VER < 20
current_pipeline == _3D &&
#endif
(dst_stages & ~pb_stall_stages) == 0 &&
(dst_stages & pb_stall_stages);
if (needs_pb_stall) {
bits |= GFX_VERx10 >= 125 ?
ANV_PIPE_PSS_STALL_SYNC_BIT :
ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
}
const bool needs_cs_stall =
(batch->engine_class == INTEL_ENGINE_CLASS_RENDER ||
batch->engine_class == INTEL_ENGINE_CLASS_COMPUTE) &&
(src_stages & cs_stall_stages);
if (needs_cs_stall)
bits |= ANV_PIPE_CS_STALL_BIT;

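/* Without RESOURCE_BARRIER, lower ANV_PIPE_RT_BTI_CHANGE to the historical
* RT cache flush + stall-at-pixel-scoreboard combination.
*/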
if (bits & ANV_PIPE_RT_BTI_CHANGE) {
bits &= ~ANV_PIPE_RT_BTI_CHANGE;
bits |= ANV_PIPE_STALL_AT_SCOREBOARD_BIT |
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
}

#if GFX_VER >= 12 && GFX_VER < 20
/* From the TGL PRM, Volume 2a, "PIPE_CONTROL":
*
* "SW must follow below programming restrictions when programming
@@ -1782,7 +2343,7 @@ genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
ANV_PIPE_END_OF_PIPE_SYNC_BIT);

uint32_t sync_op = NoWrite;
struct anv_address addr = ANV_NULL_ADDRESS;
struct anv_address addr = signal_addr;

/* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
*
@@ -1812,12 +2373,15 @@ genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
if (flush_bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
flush_bits |= ANV_PIPE_CS_STALL_BIT;
sync_op = WriteImmediateData;
addr = device->workaround_address;
if (anv_address_is_null(signal_addr))
addr = device->workaround_address;
}

/* Flush PC. */
emit_pipe_control(batch, device->info, current_pipeline,
sync_op, addr, 0, flush_bits);
sync_op, addr,
anv_address_is_null(addr) ? 0 : 1,
flush_bits);

/* If the caller wants to know what flushes have been emitted,
* provide the bits based off the PIPE_CONTROL programmed bits.
@@ -1826,7 +2390,8 @@ genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
*emitted_flush_bits = flush_bits;

bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
ANV_PIPE_END_OF_PIPE_SYNC_BIT);
ANV_PIPE_END_OF_PIPE_SYNC_BIT |
ANV_PIPE_END_OF_PIPE_SYNC_FORCE_FLUSH_L3_BIT);
}

if (bits & ANV_PIPE_INVALIDATE_BITS) {
@@ -1853,7 +2418,7 @@ genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
bits &= ~ANV_PIPE_INVALIDATE_BITS;
}

#if GFX_VER >= 12
#if GFX_VER >= 12 && GFX_VER < 20
bits |= defer_bits;
#endif

@@ -1872,6 +2437,8 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
cmd_buffer->state.pending_rhwo_optimization_enabled;
if (rhwo_opt_change) {
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_STALL_AT_SCOREBOARD_BIT |
ANV_PIPE_END_OF_PIPE_SYNC_BIT,
"change RHWO optimization");
@@ -1880,9 +2447,16 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)

enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;

/* Consume the stages here */
VkPipelineStageFlags2 src_stages = cmd_buffer->state.pending_src_stages;
VkPipelineStageFlags2 dst_stages = cmd_buffer->state.pending_dst_stages;
cmd_buffer->state.pending_src_stages = 0;
cmd_buffer->state.pending_dst_stages = 0;


if (unlikely(cmd_buffer->device->physical->always_flush_cache))
bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
else if (bits == 0)
bits |= ANV_PIPE_BARRIER_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
else if (bits == 0 && src_stages == 0 && dst_stages == 0)
return;

if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) ||
@@ -1899,7 +2473,7 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
}

genX(invalidate_aux_map)(&cmd_buffer->batch, cmd_buffer->device,
cmd_buffer->queue_family->engine_class, bits);
cmd_buffer->batch.engine_class, bits);
}
bits &= ~ANV_PIPE_INVALIDATE_BITS;
}
@@ -1924,7 +2498,8 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
genX(emit_apply_pipe_flushes)(&cmd_buffer->batch,
cmd_buffer->device,
cmd_buffer->state.current_pipeline,
bits,
src_stages, dst_stages, bits,
ANV_NULL_ADDRESS, ANV_NULL_ADDRESS,
&emitted_bits);
anv_cmd_buffer_update_pending_query_bits(cmd_buffer, emitted_bits);

@@ -2587,7 +3162,12 @@ emit_pipe_control(struct anv_batch *batch,
pipe.HDCPipelineFlushEnable = bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
#endif
pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
#if GFX_VER >= 20
pipe.ForceDeviceCoherency = bits & (ANV_PIPE_TILE_CACHE_FLUSH_BIT |
ANV_PIPE_DATA_CACHE_FLUSH_BIT);
#else
pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
#endif
pipe.RenderTargetCacheFlushEnable =
bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;

@@ -2619,6 +3199,7 @@ emit_pipe_control(struct anv_batch *batch,
pipe.InstructionCacheInvalidateEnable =
bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;

assert(!anv_address_is_null(address) || post_sync_op == NoWrite);
pipe.PostSyncOperation = post_sync_op;
pipe.Address = address;
pipe.DestinationAddressType = DAT_PPGTT;
@@ -2892,6 +3473,8 @@ genX(cmd_buffer_begin_companion)(struct anv_cmd_buffer *cmd_buffer,
if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
cmd_buffer->device->info->has_aux_map) {
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
"new cmd buffer with aux-tt");
}
@@ -2935,7 +3518,13 @@ add_pending_pipe_bits_for_color_aux_op(struct anv_cmd_buffer *cmd_buffer,
assert(ret < sizeof(flush_reason));
}

anv_add_pending_pipe_bits(cmd_buffer, pipe_bits, flush_reason);
anv_add_pending_pipe_bits(cmd_buffer,
aux_op_clears(next_aux_op) ?
VK_PIPELINE_STAGE_2_NONE :
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
aux_op_clears(next_aux_op) ?
VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT : 0,
pipe_bits, flush_reason);
}

void
@@ -2965,9 +3554,7 @@ genX(cmd_buffer_update_color_aux_op)(struct anv_cmd_buffer *cmd_buffer,
* clear pass, to ensure correct ordering between pixels.
*/
add_pending_pipe_bits_for_color_aux_op(
cmd_buffer, next_aux_op,
ANV_PIPE_PSS_STALL_SYNC_BIT |
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
cmd_buffer, next_aux_op, ANV_PIPE_RT_BTI_CHANGE);

#elif GFX_VERx10 == 125
/* From the ACM Bspec 47704 (r52663), "Render Target Fast Clear":
@@ -2989,7 +3576,6 @@ genX(cmd_buffer_update_color_aux_op)(struct anv_cmd_buffer *cmd_buffer,
*/
add_pending_pipe_bits_for_color_aux_op(
cmd_buffer, next_aux_op,
ANV_PIPE_PSS_STALL_SYNC_BIT |
ANV_PIPE_TILE_CACHE_FLUSH_BIT |
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
@@ -3059,9 +3645,7 @@ genX(cmd_buffer_update_color_aux_op)(struct anv_cmd_buffer *cmd_buffer,
* RT flush = 1
*/
add_pending_pipe_bits_for_color_aux_op(
cmd_buffer, next_aux_op,
ANV_PIPE_PSS_STALL_SYNC_BIT |
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
cmd_buffer, next_aux_op, ANV_PIPE_RT_BTI_CHANGE);

#elif GFX_VERx10 == 120
/* From the TGL PRM Vol. 9, "Color Fast Clear Synchronization":
@@ -3146,6 +3730,8 @@ genX(cmd_buffer_update_color_aux_op)(struct anv_cmd_buffer *cmd_buffer,
* cache invalidation with the texture cache invalidation done on gfx12.
*/
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
"Invalidate for new clear color");
}
@@ -3180,7 +3766,9 @@ genX(cmd_buffer_set_protected_memory)(struct anv_cmd_buffer *cmd_buffer,
}
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
pc.PipeControlFlushEnable = true;
#if GFX_VER < 20
pc.DCFlushEnable = true;
#endif
pc.RenderTargetCacheFlushEnable = true;
pc.CommandStreamerStallEnable = true;
if (enabled)
@@ -3267,6 +3855,8 @@ genX(BeginCommandBuffer)(
if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
cmd_buffer->device->info->has_aux_map) {
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
"new cmd buffer with aux-tt");
}
@@ -3294,6 +3884,8 @@ genX(BeginCommandBuffer)(
if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
cmd_buffer->device->info->has_aux_map) {
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
"new cmd buffer with aux-tt");
}
@@ -3454,6 +4046,8 @@ end_command_buffer(struct anv_cmd_buffer *cmd_buffer)
*/
if (cmd_buffer->state.queries.clear_bits) {
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_QUERY_BITS(cmd_buffer->state.queries.clear_bits),
"query clear flush prior command buffer end");
}
@@ -3563,6 +4157,8 @@ genX(CmdExecuteCommands)(
*/
if (container->state.queries.clear_bits) {
anv_add_pending_pipe_bits(container,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_QUERY_BITS(container->state.queries.clear_bits),
"query clear flush prior to secondary buffer");
}
@@ -3630,12 +4226,7 @@ genX(CmdExecuteCommands)(
src_state.alloc_size);
}
}
genX(emit_so_memcpy_fini)(&memcpy_state);

anv_add_pending_pipe_bits(container,
ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
"Wait for primary->secondary RP surface state copies");
genX(cmd_buffer_apply_pipe_flushes)(container);
genX(emit_so_memcpy_fini)(&memcpy_state, true);

if (container->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
genX(cmd_buffer_set_protected_memory)(container, true);
@@ -3715,6 +4306,8 @@ genX(CmdExecuteCommands)(
*/
if (GFX_VER == 9) {
anv_add_pending_pipe_bits(container,
VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
"Secondary cmd buffer not tracked in VF cache");
}
@@ -3788,7 +4381,7 @@ genX(CmdExecuteCommands)(
&memcpy_state,
anv_device_utrace_emit_gfx_copy_buffer);
}
genX(emit_so_memcpy_fini)(&memcpy_state);
genX(emit_so_memcpy_fini)(&memcpy_state, true);

trace_intel_end_trace_copy(&container->trace, num_traces);

@@ -3802,6 +4395,7 @@ anv_pipe_flush_bits_for_access_flags(struct anv_cmd_buffer *cmd_buffer,
VkAccessFlags2 flags,
VkAccessFlagBits3KHR flags3)
{
struct anv_device *device = cmd_buffer->device;
enum anv_pipe_bits pipe_bits = 0;

u_foreach_bit64(b, flags) {
@@ -3860,9 +4454,13 @@ anv_pipe_flush_bits_for_access_flags(struct anv_cmd_buffer *cmd_buffer,
break;
case VK_ACCESS_2_MEMORY_WRITE_BIT:
/* We're transitioning a buffer for generic write operations. Flush
* all the caches.
* all the caches. On Gfx20+ we can limit ourselves to L1/L2 flushing
* because all the fixed functions are L3 coherent (CS, streamout).
*/
pipe_bits |= ANV_PIPE_BARRIER_FLUSH_BITS;
if (device->info->ver < 20)
pipe_bits |= ANV_PIPE_BARRIER_FLUSH_BITS;
else
pipe_bits |= ANV_PIPE_L1_L2_BARRIER_FLUSH_BITS;
break;
case VK_ACCESS_2_HOST_WRITE_BIT:
/* We're transitioning a buffer for access by CPU. Invalidate
@@ -3920,11 +4518,13 @@ anv_pipe_invalidate_bits_for_access_flags(struct anv_cmd_buffer *cmd_buffer,
* an A64 message, so we need to invalidate constant cache.
*/
pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
/* Tile & Data cache flush needed For Cmd*Indirect* commands since
* command streamer is not L3 coherent.
/* Prior to Gfx20, a Tile & Data cache flush is needed for Cmd*Indirect*
* commands since the command streamer is not L3 coherent.
*/
pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT |
ANV_PIPE_DATA_CACHE_FLUSH_BIT;
if (device->info->ver < 20) {
pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT |
ANV_PIPE_TILE_CACHE_FLUSH_BIT;
}
break;
case VK_ACCESS_2_INDEX_READ_BIT:
case VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT:
@@ -3989,8 +4589,13 @@ anv_pipe_invalidate_bits_for_access_flags(struct anv_cmd_buffer *cmd_buffer,
* any in-flight flush operations have completed.
*/
pipe_bits |= ANV_PIPE_CS_STALL_BIT;
pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
/* Prior to Gfx20, CS is not L3 coherent, so make the data available
* for it by flushing L3.
*/
if (device->info->ver < 20) {
pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
}
break;
case VK_ACCESS_2_HOST_READ_BIT:
/* We're transitioning a buffer that was written by CPU. Flush
@@ -4004,8 +4609,10 @@ anv_pipe_invalidate_bits_for_access_flags(struct anv_cmd_buffer *cmd_buffer,
* tile cache flush to make sure any previous write is not going to
* create WaW hazards.
*/
pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
if (device->info->ver < 20) {
pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
}
break;
case VK_ACCESS_2_SHADER_STORAGE_READ_BIT:
case VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR:
@@ -4113,7 +4720,9 @@ mask_is_transfer_write(const VkAccessFlags2 access)
static void
cmd_buffer_barrier_video(struct anv_cmd_buffer *cmd_buffer,
uint32_t n_dep_infos,
const VkDependencyInfo *dep_infos)
const VkDependencyInfo *dep_infos,
struct anv_address signal_addr,
uint64_t signal_value)
{
assert(anv_cmd_buffer_is_video_queue(cmd_buffer));

@@ -4193,7 +4802,7 @@ cmd_buffer_barrier_video(struct anv_cmd_buffer *cmd_buffer,
break;
}

if (flush_ccs || flush_llc) {
if (flush_ccs || flush_llc || !anv_address_is_null(signal_addr)) {
anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
#if GFX_VERx10 >= 125
fd.FlushCCS = flush_ccs;
@@ -4205,6 +4814,12 @@ cmd_buffer_barrier_video(struct anv_cmd_buffer *cmd_buffer,
*/
fd.FlushLLC = flush_llc;
#endif

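/* When called from vkCmdSetEvent2, also write the event value as a
* post-sync operation of the flush.
*/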
if (!anv_address_is_null(signal_addr)) {
fd.PostSyncOperation = WriteImmediateData;
fd.Address = signal_addr;
fd.ImmediateData = signal_value;
}
}
}
}
@@ -4212,7 +4827,9 @@ cmd_buffer_barrier_video(struct anv_cmd_buffer *cmd_buffer,
static void
cmd_buffer_barrier_blitter(struct anv_cmd_buffer *cmd_buffer,
uint32_t n_dep_infos,
const VkDependencyInfo *dep_infos)
const VkDependencyInfo *dep_infos,
struct anv_address signal_addr,
uint64_t signal_value)
{
#if GFX_VERx10 >= 125
assert(anv_cmd_buffer_is_blitter_queue(cmd_buffer));
@@ -4305,7 +4922,7 @@ cmd_buffer_barrier_blitter(struct anv_cmd_buffer *cmd_buffer,
break;
}

if (flush_ccs || flush_llc) {
if (flush_ccs || flush_llc || !anv_address_is_null(signal_addr)) {
/* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
if (INTEL_WA_16018063123_GFX_VER) {
genX(batch_emit_fast_color_dummy_blit)(&cmd_buffer->batch,
@@ -4314,6 +4931,12 @@ cmd_buffer_barrier_blitter(struct anv_cmd_buffer *cmd_buffer,
anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
fd.FlushCCS = flush_ccs;
fd.FlushLLC = flush_llc;

if (!anv_address_is_null(signal_addr)) {
fd.PostSyncOperation = WriteImmediateData;
fd.Address = signal_addr;
fd.ImmediateData = signal_value;
}
}
}
#endif
@@ -4620,75 +5243,16 @@ cmd_buffer_accumulate_barrier_bits(struct anv_cmd_buffer *cmd_buffer,
anv_pipe_flush_bits_for_access_flags(cmd_buffer, src_flags, src_flags3) |
anv_pipe_invalidate_bits_for_access_flags(cmd_buffer, dst_flags, dst_flags3);

/* What stage require a stall at pixel scoreboard */
VkPipelineStageFlags2 pb_stall_stages =
VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT;
if (anv_cmd_buffer_is_render_queue(cmd_buffer)) {
/* On a render queue, the following stages can also use a pixel shader.
*/
pb_stall_stages |=
VK_PIPELINE_STAGE_2_TRANSFER_BIT |
VK_PIPELINE_STAGE_2_RESOLVE_BIT |
VK_PIPELINE_STAGE_2_BLIT_BIT |
VK_PIPELINE_STAGE_2_CLEAR_BIT;
}
VkPipelineStageFlags2 cs_stall_stages =
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT |
VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR |
VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR;
if (anv_cmd_buffer_is_compute_queue(cmd_buffer)) {
/* On a compute queue, the following stages can also use a compute
* shader.
*/
cs_stall_stages |=
VK_PIPELINE_STAGE_2_TRANSFER_BIT |
VK_PIPELINE_STAGE_2_RESOLVE_BIT |
VK_PIPELINE_STAGE_2_BLIT_BIT |
VK_PIPELINE_STAGE_2_CLEAR_BIT;
} else if (anv_cmd_buffer_is_render_queue(cmd_buffer) &&
cmd_buffer->state.current_pipeline == GPGPU) {
/* In GPGPU mode, the render queue can also use a compute shader for
* transfer operations.
/* Copies from query pools are executed with a shader writing through the
* dataport.
*/
cs_stall_stages |= VK_PIPELINE_STAGE_2_TRANSFER_BIT;
if (flush_query_copies) {
bits |= (GFX_VER >= 12 ?
ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : ANV_PIPE_DATA_CACHE_FLUSH_BIT);
}

/* Prior to Gfx20, we can restrict pb-stall/cs-stall to some pipeline
* modes. Gfx20 doesn't do pipeline switches so we have to assume the worse
* case.
*
* To use a PB-stall we need both destination stages to be contained to the
* fragment shader stages. That way the HW can hold the fragment shader
* dispatch until the synchronization operation happened.
*/
const bool needs_pb_stall =
anv_cmd_buffer_is_render_queue(cmd_buffer) &&
#if GFX_VER < 20
cmd_buffer->state.current_pipeline == _3D &&
#endif
(dst_stages & ~pb_stall_stages) == 0 &&
(dst_stages & pb_stall_stages);
if (needs_pb_stall) {
bits |= GFX_VERx10 >= 125 ?
ANV_PIPE_PSS_STALL_SYNC_BIT :
ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
}
const bool needs_cs_stall =
anv_cmd_buffer_is_render_or_compute_queue(cmd_buffer) &&
(dst_stages & cs_stall_stages);
if (needs_cs_stall)
bits |= ANV_PIPE_CS_STALL_BIT;
if (dst_flags & VK_ACCESS_INDIRECT_COMMAND_READ_BIT)
genX(cmd_buffer_flush_generated_draws)(cmd_buffer);

#if GFX_VER < 20
/* Our HW implementation of the sparse feature prior to Xe2 lives in the
@@ -4712,6 +5276,9 @@ cmd_buffer_accumulate_barrier_bits(struct anv_cmd_buffer *cmd_buffer,
* dataport.
*/
if (flush_query_copies) {
src_stages |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT |
VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT;
dst_stages |= VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT;
bits |= (GFX_VER >= 12 ?
ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : ANV_PIPE_DATA_CACHE_FLUSH_BIT);
}
@@ -4732,11 +5299,13 @@ cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
{
switch (cmd_buffer->batch.engine_class) {
case INTEL_ENGINE_CLASS_VIDEO:
cmd_buffer_barrier_video(cmd_buffer, n_dep_infos, dep_infos);
cmd_buffer_barrier_video(cmd_buffer, n_dep_infos, dep_infos,
ANV_NULL_ADDRESS, 0);
break;

case INTEL_ENGINE_CLASS_COPY:
cmd_buffer_barrier_blitter(cmd_buffer, n_dep_infos, dep_infos);
cmd_buffer_barrier_blitter(cmd_buffer, n_dep_infos, dep_infos,
ANV_NULL_ADDRESS, 0);
break;

case INTEL_ENGINE_CLASS_RENDER:
@@ -4746,7 +5315,7 @@ cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
cmd_buffer_accumulate_barrier_bits(cmd_buffer, n_dep_infos, dep_infos,
&src_stages, &dst_stages, &bits);

anv_add_pending_pipe_bits(cmd_buffer, bits, reason);
anv_add_pending_pipe_bits(cmd_buffer, src_stages, dst_stages, bits, reason);
break;
}

@@ -4842,6 +5411,7 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
if (cmd_buffer->state.current_pipeline == pipeline)
return;

#if GFX_VER < 20
#if GFX_VER == 9
/* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
*
@@ -4877,6 +5447,8 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
if (cmd_buffer->state.current_pipeline == _3D &&
cmd_buffer->state.queries.clear_bits) {
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_QUERY_BITS(cmd_buffer->state.queries.clear_bits),
"query clear flush prior to GPGPU");
}
@@ -4943,7 +5515,10 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
intel_needs_workaround(cmd_buffer->device->info, 16013063087))
bits |= ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;

anv_add_pending_pipe_bits(cmd_buffer, bits,
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
bits,
pipeline == _3D ?
"flush/invalidate PIPELINE_SELECT 3D" :
"flush/invalidate PIPELINE_SELECT GPGPU");
@@ -5012,6 +5587,7 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
if (pipeline == GPGPU)
cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
#endif
#endif /* GFX_VER < 20 */
cmd_buffer->state.current_pipeline = pipeline;
}

@@ -5053,6 +5629,8 @@ genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer,
* settings while we change the registers.
*/
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
ANV_PIPE_DEPTH_STALL_BIT |
ANV_PIPE_END_OF_PIPE_SYNC_BIT,
@@ -5128,6 +5706,8 @@ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer
vb_address,
vb_size)) {
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_CS_STALL_BIT |
ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
"vb > 32b range");
@@ -5237,6 +5817,8 @@ genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
if (cmd_buffer->state.current_hash_scale != scale &&
(width > min_size[idx][0] || height > min_size[idx][1])) {
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_CS_STALL_BIT |
ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
"change pixel hash mode");
@@ -5945,9 +6527,10 @@ void genX(CmdBeginRendering)(
* in the case that there are no RTs (depth-only rendering), though.
*/
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
"change RT");
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
ANV_PIPE_RT_BTI_CHANGE,
"change RT");
}
#endif

@@ -6036,6 +6619,8 @@ void genX(CmdEndRendering2KHR)(
* sampler when we blit to the single-sampled resolve target.
*/
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
"MSAA resolve");
@@ -6052,9 +6637,11 @@ void genX(CmdEndRendering2KHR)(
* sampler when we blit to the single-sampled resolve target.
*/
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,
"MSAA resolve");
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,
"MSAA resolve");
}

#if GFX_VER < 20
@@ -6083,7 +6670,10 @@ void genX(CmdEndRendering2KHR)(
* sure unbound regions read 0, as residencyNonResidentStrict
* mandates.
*/
anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_TILE_CACHE_FLUSH_BIT,
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_TILE_CACHE_FLUSH_BIT,
"sparse MSAA resolve");
}
#endif
@@ -6234,37 +6824,46 @@ void genX(CmdSetEvent2)(

switch (cmd_buffer->batch.engine_class) {
case INTEL_ENGINE_CLASS_VIDEO:
cmd_buffer_barrier_video(cmd_buffer, 1, pDependencyInfo,
anv_state_pool_state_address(
&cmd_buffer->device->dynamic_state_pool,
event->state), 1);
break;

case INTEL_ENGINE_CLASS_COPY:
anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
flush.PostSyncOperation = WriteImmediateData;
flush.Address = anv_state_pool_state_address(
&cmd_buffer->device->dynamic_state_pool,
event->state);
flush.ImmediateData = VK_EVENT_SET;
}
cmd_buffer_barrier_blitter(cmd_buffer, 1, pDependencyInfo,
anv_state_pool_state_address(
&cmd_buffer->device->dynamic_state_pool,
event->state), 1);
break;

case INTEL_ENGINE_CLASS_RENDER:
case INTEL_ENGINE_CLASS_COMPUTE: {
VkPipelineStageFlags2 src_stages =
vk_collect_dependency_info_src_stages(pDependencyInfo);
VkPipelineStageFlags2 src_stages, dst_stages;
enum anv_pipe_bits bits = 0;
cmd_buffer_accumulate_barrier_bits(cmd_buffer, 1, pDependencyInfo,
&src_stages, &dst_stages, &bits);

cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
/* Only consider the flush bits, the wait part will do the invalidate.
*/
bits &= ANV_PIPE_FLUSH_BITS;

enum anv_pipe_bits pc_bits = 0;
if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
pc_bits |= ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
pc_bits |= ANV_PIPE_CS_STALL_BIT;
}
/* Ensure the signal_addr gets written. */
bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;

genX(batch_emit_pipe_control_write)
(&cmd_buffer->batch, cmd_buffer->device->info,
cmd_buffer->state.current_pipeline, WriteImmediateData,
anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
/* Need main memory coherency */
if ((event->flags & VK_EVENT_CREATE_DEVICE_ONLY_BIT) == 0)
bits |= ANV_PIPE_END_OF_PIPE_SYNC_FORCE_FLUSH_L3_BIT;

genX(emit_apply_pipe_flushes)(&cmd_buffer->batch,
cmd_buffer->device,
cmd_buffer->state.current_pipeline,
src_stages, dst_stages, bits,
anv_state_pool_state_address(
&cmd_buffer->device->dynamic_state_pool,
event->state),
VK_EVENT_SET, pc_bits,
"vkCmdSetEvent2");
ANV_NULL_ADDRESS,
NULL);
break;
}

@@ -6281,6 +6880,10 @@ void genX(CmdResetEvent2)(
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_event, event, _event);

/* Write 0 as the reset value. For the signal value, PIPE_CONTROL/MI_FLUSH_DW
* write 1, while RESOURCE_BARRIER can write any non-zero value.
*/

switch (cmd_buffer->batch.engine_class) {
case INTEL_ENGINE_CLASS_VIDEO:
case INTEL_ENGINE_CLASS_COPY:
@@ -6289,13 +6892,16 @@ void genX(CmdResetEvent2)(
flush.Address = anv_state_pool_state_address(
&cmd_buffer->device->dynamic_state_pool,
event->state);
flush.ImmediateData = VK_EVENT_RESET;
flush.ImmediateData = 0;
}
break;

case INTEL_ENGINE_CLASS_RENDER:
case INTEL_ENGINE_CLASS_COMPUTE: {
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
anv_add_pending_pipe_bits(cmd_buffer,
stageMask, 0,
ANV_PIPE_POST_SYNC_BIT,
"event reset");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

enum anv_pipe_bits pc_bits = 0;
@@ -6304,12 +6910,15 @@ void genX(CmdResetEvent2)(
pc_bits |= ANV_PIPE_CS_STALL_BIT;
}

/* We have to use PIPE_CONTROL here as RESOURCE_BARRIER cannot write a 0
* value.
*/
genX(batch_emit_pipe_control_write)
(&cmd_buffer->batch, cmd_buffer->device->info,
cmd_buffer->state.current_pipeline, WriteImmediateData,
anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
event->state),
VK_EVENT_RESET,
0,
pc_bits,
"vkCmdResetEvent2");
break;
@@ -6328,20 +6937,68 @@ void genX(CmdWaitEvents2)(
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

VkPipelineStageFlags2 final_src_stages = 0, final_dst_stages = 0;
enum anv_pipe_bits final_invalidates = 0;
for (uint32_t i = 0; i < eventCount; i++) {
ANV_FROM_HANDLE(anv_event, event, pEvents[i]);
struct anv_address wait_addr =
anv_state_pool_state_address(
&cmd_buffer->device->dynamic_state_pool,
event->state);

VkPipelineStageFlags2 src_stages, dst_stages;
enum anv_pipe_bits bits;
cmd_buffer_accumulate_barrier_bits(cmd_buffer, 1, &pDependencyInfos[i],
&src_stages, &dst_stages, &bits);

if ((pDependencyInfos->dependencyFlags & VK_DEPENDENCY_ASYMMETRIC_EVENT_BIT_KHR) == 0) {
/* Only consider the invalidate bits, the signal part will do the
* flushing.
*
* We cannot do this with VK_KHR_maintenance9's ASYMMETRIC_EVENT_BIT,
* which allows the full barrier (with all access masks) to be specified
* only on the vkCmdWaitEvents2 entry point.
*/
bits &= ANV_PIPE_INVALIDATE_BITS;
}

/* Need main memory coherency */
if ((event->flags & VK_EVENT_CREATE_DEVICE_ONLY_BIT) == 0)
bits |= ANV_PIPE_END_OF_PIPE_SYNC_FORCE_FLUSH_L3_BIT;

#if GFX_VER >= 20
if (can_use_resource_barrier(cmd_buffer->device->info,
cmd_buffer->batch.engine_class,
src_stages, dst_stages, bits,
ANV_NULL_ADDRESS, wait_addr)) {
emit_resource_barrier(&cmd_buffer->batch,
cmd_buffer->device->info,
src_stages, dst_stages, bits,
ANV_NULL_ADDRESS, wait_addr);
continue;
}
#endif

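/* Otherwise poll the event value with MI_SEMAPHORE_WAIT: events are
* signaled with any non-zero value, so wait for != 0.
*/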
anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
sem.WaitMode = PollingMode;
sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
sem.SemaphoreDataDword = VK_EVENT_SET;
sem.SemaphoreAddress = anv_state_pool_state_address(
&cmd_buffer->device->dynamic_state_pool,
event->state);
sem.CompareOperation = COMPARE_SAD_NOT_EQUAL_SDD;
sem.SemaphoreDataDword = 0;
sem.SemaphoreAddress = wait_addr;
}

final_src_stages |= src_stages;
final_dst_stages |= dst_stages;
final_invalidates |= bits;
}

cmd_buffer_barrier(cmd_buffer, eventCount, pDependencyInfos, "wait event");
if (final_src_stages != 0 ||
final_dst_stages != 0 ||
final_invalidates != 0) {
anv_add_pending_pipe_bits(cmd_buffer,
final_src_stages, final_dst_stages,
final_invalidates,
"wait event");
}
}

VkResult genX(CmdSetPerformanceOverrideINTEL)(
@@ -6365,6 +7022,8 @@ VkResult genX(CmdSetPerformanceOverrideINTEL)(
if (pOverrideInfo->enable) {
/* FLUSH ALL THE THINGS! As requested by the MDAPI team. */
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_BARRIER_FLUSH_BITS |
ANV_PIPE_INVALIDATE_BITS,
"perf counter isolation");
@@ -6604,9 +7263,12 @@ genX(cmd_buffer_begin_companion_rcs_syncpoint)(
*/

if (anv_cmd_buffer_is_compute_queue(cmd_buffer)) {
anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_BARRIER_FLUSH_BITS |
ANV_PIPE_INVALIDATE_BITS |
ANV_PIPE_STALL_BITS,
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_BARRIER_FLUSH_BITS |
ANV_PIPE_INVALIDATE_BITS |
ANV_PIPE_STALL_BITS,
"post main cmd buffer invalidate");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
} else if (anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
@@ -6676,6 +7338,8 @@ genX(cmd_buffer_end_companion_rcs_syncpoint)(struct anv_cmd_buffer *cmd_buffer,
* - unblock the CCS
*/
anv_add_pending_pipe_bits(cmd_buffer->companion_rcs_cmd_buffer,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_BARRIER_FLUSH_BITS |
ANV_PIPE_INVALIDATE_BITS |
ANV_PIPE_STALL_BITS,
@@ -6822,7 +7486,10 @@ genX(CmdWriteBufferMarker2AMD)(VkCommandBuffer commandBuffer,

trace_intel_begin_write_buffer_marker(&cmd_buffer->trace);

anv_add_pending_pipe_bits(cmd_buffer, bits, "write buffer marker");
anv_add_pending_pipe_bits(cmd_buffer,
stage,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
bits, "write buffer marker");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

struct mi_builder b;


src/intel/vulkan/genX_cmd_compute.c (+4, -2)

@@ -136,8 +136,10 @@ cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
* sufficient."
*/
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_CS_STALL_BIT,
"flush compute state");
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
ANV_PIPE_CS_STALL_BIT,
"flush compute state");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
#endif



src/intel/vulkan/genX_cmd_draw.c (+13, -2)

@@ -712,8 +712,9 @@ cmd_buffer_maybe_flush_rt_writes(struct anv_cmd_buffer *cmd_buffer,
* in the shader always send the color.
*/
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
ANV_PIPE_RT_BTI_CHANGE,
"change RT due to shader outputs");
#endif
}
@@ -854,6 +855,8 @@ cmd_buffer_flush_gfx_state(struct anv_cmd_buffer *cmd_buffer)
*/
if (intel_needs_workaround(device->info, 16011411144)) {
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT,
VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT,
ANV_PIPE_CS_STALL_BIT,
"before SO_BUFFER change WA");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
@@ -889,12 +892,16 @@ cmd_buffer_flush_gfx_state(struct anv_cmd_buffer *cmd_buffer)
if (intel_needs_workaround(device->info, 16011411144)) {
/* Wa_16011411144: also CS_STALL after touching SO_BUFFER change */
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT,
VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT,
ANV_PIPE_CS_STALL_BIT,
"after SO_BUFFER change WA");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
} else if (GFX_VER >= 10) {
/* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT,
VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT,
ANV_PIPE_CS_STALL_BIT,
"after 3DSTATE_SO_BUFFER call");
}
@@ -2365,6 +2372,8 @@ void genX(CmdBeginTransformFeedbackEXT)(
* commands are processed. This will likely require a pipeline flush."
*/
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_CS_STALL_BIT,
"begin transform feedback");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
@@ -2417,6 +2426,8 @@ void genX(CmdEndTransformFeedbackEXT)(
* commands are processed. This will likely require a pipeline flush."
*/
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_CS_STALL_BIT,
"end transform feedback");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);


src/intel/vulkan/genX_cmd_draw_generated_flush.h (+3, -3)

@@ -46,15 +46,15 @@ genX(cmd_buffer_flush_generated_draws)(struct anv_cmd_buffer *cmd_buffer)
struct anv_batch *batch = &cmd_buffer->generation.batch;

/* Wait for all the generation vertex shader to generate the commands. */
genX(emit_apply_pipe_flushes)(batch,
cmd_buffer->device,
genX(batch_emit_pipe_control)(batch,
cmd_buffer->device->info,
_3D,
#if GFX_VER == 9
ANV_PIPE_VF_CACHE_INVALIDATE_BIT |
#endif
ANV_PIPE_DATA_CACHE_FLUSH_BIT |
ANV_PIPE_CS_STALL_BIT,
NULL /* emitted_bits */);
"generated draw flush");

#if GFX_VER >= 12
anv_batch_emit(batch, GENX(MI_ARB_CHECK), arb) {


src/intel/vulkan/genX_cmd_draw_generated_indirect.h (+12, -0)

@@ -548,6 +548,10 @@ genX(cmd_buffer_emit_indirect_generated_draws_inring)(struct anv_cmd_buffer *cmd
struct anv_gen_indirect_params *params = params_state.map;

anv_add_pending_pipe_bits(cmd_buffer,
gen_kernel->stage == MESA_SHADER_FRAGMENT ?
VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT :
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
#if GFX_VER == 9
ANV_PIPE_VF_CACHE_INVALIDATE_BIT |
#endif
@@ -597,6 +601,10 @@ genX(cmd_buffer_emit_indirect_generated_draws_inring)(struct anv_cmd_buffer *cmd
anv_batch_current_address(&cmd_buffer->batch);

anv_add_pending_pipe_bits(cmd_buffer,
gen_kernel->stage == MESA_SHADER_FRAGMENT ?
VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT :
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_STALL_AT_SCOREBOARD_BIT |
ANV_PIPE_CS_STALL_BIT,
"after generated draws batch");
@@ -623,6 +631,8 @@ genX(cmd_buffer_emit_indirect_generated_draws_inring)(struct anv_cmd_buffer *cmd
mi_ensure_write_fence(&b);

anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT,
"after generated draws batch increment");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
@@ -645,6 +655,8 @@ genX(cmd_buffer_emit_indirect_generated_draws_inring)(struct anv_cmd_buffer *cmd
mi_ensure_write_fence(&b);

anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT,
"after generated draws end");



src/intel/vulkan/genX_gfx_state.c (+4, -3)

@@ -3962,9 +3962,10 @@ cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)

#if INTEL_WA_18019816803_GFX_VER
if (IS_DIRTY(WA_18019816803)) {
genx_batch_emit_pipe_control(batch, device->info,
cmd_buffer->state.current_pipeline,
ANV_PIPE_PSS_STALL_SYNC_BIT);
genX(batch_emit_pipe_control)(batch, device->info,
cmd_buffer->state.current_pipeline,
ANV_PIPE_PSS_STALL_SYNC_BIT,
"Wa_18019816803");
}
#endif



src/intel/vulkan/genX_gpu_memcpy.c (+14, -24)

@@ -318,11 +318,14 @@ genX(emit_so_memcpy_init)(struct anv_memcpy_state *state,
}

void
genX(emit_so_memcpy_fini)(struct anv_memcpy_state *state)
genX(emit_so_memcpy_fini)(struct anv_memcpy_state *state,
bool wait_completion)
{
genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D,
ANV_PIPE_END_OF_PIPE_SYNC_BIT,
NULL);
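/* Only emit the end-of-pipe sync when the caller needs to wait for the
* copy to complete.
*/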
if (wait_completion) {
genX(batch_emit_pipe_control)(state->batch, state->device->info, _3D,
ANV_PIPE_END_OF_PIPE_SYNC_BIT,
"Post GPU memcpy wait");
}

if (state->cmd_buffer) {
/* Flag all the instructions emitted by the memcpy. */
@@ -375,7 +378,11 @@ genX(emit_so_memcpy_fini)(struct anv_memcpy_state *state)
void
genX(emit_so_memcpy_end)(struct anv_memcpy_state *state)
{
#if INTEL_WA_16013994831_GFX_VER
genX(batch_emit_pipe_control)(state->batch, state->device->info, _3D,
ANV_PIPE_END_OF_PIPE_SYNC_BIT,
"Post GPU memcpy wait");

#if INTEL_WA_16013994831_GFX_VER
/* Turn preemption back on when we're done */
if (intel_needs_workaround(state->device->info, 16013994831))
genX(batch_set_preemption)(state->batch, state->device, _3D, true);
@@ -396,29 +403,12 @@ genX(emit_so_memcpy)(struct anv_memcpy_state *state,
anv_gfx8_9_vb_cache_range_needs_workaround(&state->vb_bound,
&state->vb_dirty,
src, size)) {
genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D,
genX(batch_emit_pipe_control)(state->batch, state->device->info, _3D,
ANV_PIPE_CS_STALL_BIT |
ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
NULL);
"Gfx9 VB cache workaround");
memset(&state->vb_dirty, 0, sizeof(state->vb_dirty));
}

emit_so_memcpy(state, dst, src, size);
}

void
genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
struct anv_address dst, struct anv_address src,
uint32_t size)
{
if (size == 0)
return;

struct anv_memcpy_state state;
genX(emit_so_memcpy_init)(&state,
cmd_buffer->device,
cmd_buffer,
&cmd_buffer->batch);
emit_so_memcpy(&state, dst, src, size);
genX(emit_so_memcpy_fini)(&state);
}
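The genX_gpu_memcpy.c hunk above makes the end-of-pipe wait in genX(emit_so_memcpy_fini) opt-in and drops the genX(cmd_buffer_so_memcpy) wrapper, so its former body now has to live at each call site. A hedged sketch of what such a call site might look like after the change, built only from the signatures visible in this diff (whether a given caller actually wants wait_completion to be true is an assumption):

   struct anv_memcpy_state state;
   genX(emit_so_memcpy_init)(&state, cmd_buffer->device, cmd_buffer,
                             &cmd_buffer->batch);
   genX(emit_so_memcpy)(&state, dst, src, size);
   genX(emit_so_memcpy_fini)(&state, true /* wait_completion, assumed */);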

+ 3
- 0
src/intel/vulkan/genX_init_state.c View File

@@ -666,6 +666,7 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
cm.Mask1 = 0xffff;
#if GFX_VERx10 >= 200
cm.Mask2 = 0xffff;
cm.UAVCoherencyMode = FlushDataportL1;
#endif
}
anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), zero);
@@ -808,6 +809,8 @@ init_compute_queue_state(struct anv_queue *queue)
cm.AsyncComputeThreadLimitMask = 0x7;
cm.ZPassAsyncComputeThreadLimitMask = 0x7;
cm.ZAsyncThrottlesettingsMask = 0x3;
cm.Mask2 = 0xffff;
cm.UAVCoherencyMode = FlushDataportL1;
#else
cm.PixelAsyncComputeThreadLimit = PACTL_Max24;
cm.ZPassAsyncComputeThreadLimit = ZPACTL_Max60;


+ 19
- 2
src/intel/vulkan/genX_query.c View File

@@ -917,7 +917,10 @@ void genX(CmdResetQueryPool)(
* completed. Otherwise some timestamps written later with MI_STORE_*
* commands might race with the PIPE_CONTROL in the loop above.
*/
anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_CS_STALL_BIT,
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_CS_STALL_BIT,
"vkCmdResetQueryPool of timestamps");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
break;
@@ -1091,6 +1094,9 @@ append_query_clear_flush(struct anv_cmd_buffer *cmd_buffer,
return false;

anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT |
VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_QUERY_BITS(
cmd_buffer->state.queries.clear_bits),
reason);
@@ -1735,6 +1741,9 @@ copy_query_results_with_cs(struct anv_cmd_buffer *cmd_buffer,

if (needed_flushes) {
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT |
VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
needed_flushes,
"CopyQueryPoolResults");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
@@ -1847,6 +1856,7 @@ copy_query_results_with_shader(struct anv_cmd_buffer *cmd_buffer,
uint32_t query_count,
VkQueryResultFlags flags)
{
VkPipelineStageFlags2 wait_stages = 0;
enum anv_pipe_bits needed_flushes = 0;

trace_intel_begin_query_copy_shader(&cmd_buffer->trace);
@@ -1867,11 +1877,14 @@ copy_query_results_with_shader(struct anv_cmd_buffer *cmd_buffer,
}

if ((cmd_buffer->state.queries.buffer_write_bits |
cmd_buffer->state.queries.clear_bits) & ANV_QUERY_WRITES_RT_FLUSH)
cmd_buffer->state.queries.clear_bits) & ANV_QUERY_WRITES_RT_FLUSH) {
wait_stages |= VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT;
needed_flushes |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
}

if ((cmd_buffer->state.queries.buffer_write_bits |
cmd_buffer->state.queries.clear_bits) & ANV_QUERY_WRITES_DATA_FLUSH) {
wait_stages |= VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
needed_flushes |= (ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT);
}
@@ -1901,6 +1914,8 @@ copy_query_results_with_shader(struct anv_cmd_buffer *cmd_buffer,

if (needed_flushes) {
anv_add_pending_pipe_bits(cmd_buffer,
wait_stages,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
needed_flushes | ANV_PIPE_END_OF_PIPE_SYNC_BIT,
"CopyQueryPoolResults");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
@@ -2071,6 +2086,8 @@ genX(CmdWriteAccelerationStructuresPropertiesKHR)(
*/
if (!ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info)) {
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_END_OF_PIPE_SYNC_BIT |
ANV_PIPE_DATA_CACHE_FLUSH_BIT,
"read BVH data using CS");


+ 5
- 12
src/intel/vulkan/genX_simple_shader.c View File

@@ -673,10 +673,9 @@ genX(emit_simple_shader_dispatch)(struct anv_simple_shader *state,
/* TODO: switch to use INTEL_NEEDS_WA_14025112257 */
if (device->info->ver >= 20 &&
batch->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
enum anv_pipe_bits emitted_bits = 0;
genX(emit_apply_pipe_flushes)(batch, device, GPGPU,
genX(batch_emit_pipe_control)(batch, devinfo, GPGPU,
ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
&emitted_bits);
"Wa_14025112257");
}
}

@@ -693,15 +692,9 @@ genX(emit_simple_shader_dispatch)(struct anv_simple_shader *state,
* these scoreboard related states, a MEDIA_STATE_FLUSH is
* sufficient."
*/
enum anv_pipe_bits emitted_bits = 0;
genX(emit_apply_pipe_flushes)(batch, device, GPGPU, ANV_PIPE_CS_STALL_BIT,
&emitted_bits);

/* If we have a command buffer allocated with the emission, update the
* pending bits.
*/
if (state->cmd_buffer)
anv_cmd_buffer_update_pending_query_bits(state->cmd_buffer, emitted_bits);
genX(batch_emit_pipe_control)(batch, devinfo, GPGPU,
ANV_PIPE_CS_STALL_BIT,
"pre MEDIA_VFE_STATE");

anv_batch_emit(batch, GENX(MEDIA_VFE_STATE), vfe) {
vfe.StackSize = 0;


+ 1
- 1
src/nouveau/vulkan/nvk_cmd_draw.c View File

@@ -3459,7 +3459,7 @@ nvk_flush_cb_state(struct nvk_cmd_buffer *cmd)
int8_t loc_att[NVK_MAX_RTS] = { -1, -1, -1, -1, -1, -1, -1, -1};
uint8_t max_loc = 0;
uint32_t att_used = 0;
for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
for (uint8_t a = 0; a < render->color_att_count; a++) {
if (dyn->cal.color_map[a] == MESA_VK_ATTACHMENT_UNUSED)
continue;


