diff --git a/drivers/gpu/drm/v3d/Makefile b/drivers/gpu/drm/v3d/Makefile
index db4cfc1558219cfe88cbdc7d4a7c161d25ed0233..e8b31413702075993c41e0ae831d732b7e6f3b17 100644
--- a/drivers/gpu/drm/v3d/Makefile
+++ b/drivers/gpu/drm/v3d/Makefile
@@ -9,6 +9,7 @@ v3d-y := \
 	v3d_gem.o \
 	v3d_irq.o \
 	v3d_mmu.o \
+	v3d_perfmon.o \
 	v3d_trace_points.o \
 	v3d_sched.o
 
diff --git a/drivers/gpu/drm/v3d/v3d_drv.c b/drivers/gpu/drm/v3d/v3d_drv.c
index 99e22beea90b1dedecdda0e78112ced6f3c285d7..9403c3b36aca681c30c19fb973e65f574be9532c 100644
--- a/drivers/gpu/drm/v3d/v3d_drv.c
+++ b/drivers/gpu/drm/v3d/v3d_drv.c
@@ -94,6 +94,9 @@ static int v3d_get_param_ioctl(struct drm_device *dev, void *data,
 	case DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH:
 		args->value = 1;
 		return 0;
+	case DRM_V3D_PARAM_SUPPORTS_PERFMON:
+		args->value = (v3d->ver >= 40);
+		return 0;
 	default:
 		DRM_DEBUG("Unknown parameter %d\n", args->param);
 		return -EINVAL;
@@ -121,6 +124,7 @@ v3d_open(struct drm_device *dev, struct drm_file *file)
 				      1, NULL);
 	}
 
+	v3d_perfmon_open_file(v3d_priv);
 	file->driver_priv = v3d_priv;
 
 	return 0;
@@ -136,6 +140,7 @@ v3d_postclose(struct drm_device *dev, struct drm_file *file)
 		drm_sched_entity_destroy(&v3d_priv->sched_entity[q]);
 	}
 
+	v3d_perfmon_close_file(v3d_priv);
 	kfree(v3d_priv);
 }
 
@@ -156,6 +161,9 @@ static const struct drm_ioctl_desc v3d_drm_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(V3D_GET_BO_OFFSET, v3d_get_bo_offset_ioctl, DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(V3D_SUBMIT_TFU, v3d_submit_tfu_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
 	DRM_IOCTL_DEF_DRV(V3D_SUBMIT_CSD, v3d_submit_csd_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
+	DRM_IOCTL_DEF_DRV(V3D_PERFMON_CREATE, v3d_perfmon_create_ioctl, DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(V3D_PERFMON_DESTROY, v3d_perfmon_destroy_ioctl, DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(V3D_PERFMON_GET_VALUES, v3d_perfmon_get_values_ioctl, DRM_RENDER_ALLOW),
 };
 
 static const struct drm_driver v3d_drm_driver = {
diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
index 8a390738d65baf20189d799d1481d4cd21f636f0..270134779073048e74ac49432a12f650b09cfa66 100644
--- a/drivers/gpu/drm/v3d/v3d_drv.h
+++ b/drivers/gpu/drm/v3d/v3d_drv.h
@@ -37,6 +37,40 @@ struct v3d_queue_state {
 	u64 emit_seqno;
 };
 
+/* Performance monitor object. The perform lifetime is controlled by userspace
+ * using perfmon related ioctls. A perfmon can be attached to a submit_cl
+ * request, and when this is the case, HW perf counters will be activated just
+ * before the submit_cl is submitted to the GPU and disabled when the job is
+ * done. This way, only events related to a specific job will be counted.
+ */
+struct v3d_perfmon {
+	/* Tracks the number of users of the perfmon, when this counter reaches
+	 * zero the perfmon is destroyed.
+	 */
+	refcount_t refcnt;
+
+	/* Protects perfmon stop, as it can be invoked from multiple places. */
+	struct mutex lock;
+
+	/* Number of counters activated in this perfmon instance
+	 * (should be less than DRM_V3D_MAX_PERF_COUNTERS).
+	 */
+	u8 ncounters;
+
+	/* Events counted by the HW perf counters. */
+	u8 counters[DRM_V3D_MAX_PERF_COUNTERS];
+
+	/* Storage for counter values. Counters are incremented by the
+	 * HW perf counter values every time the perfmon is attached
+	 * to a GPU job.  This way, perfmon users don't have to
+	 * retrieve the results after each job if they want to track
+	 * events covering several submissions.  Note that counter
+	 * values can't be reset, but you can fake a reset by
+	 * destroying the perfmon and creating a new one.
+	 */
+	u64 values[];
+};
+
 struct v3d_dev {
 	struct drm_device drm;
 
@@ -89,6 +123,9 @@ struct v3d_dev {
 	 */
 	spinlock_t job_lock;
 
+	/* Used to track the active perfmon if any. */
+	struct v3d_perfmon *active_perfmon;
+
 	/* Protects bo_stats */
 	struct mutex bo_lock;
 
@@ -133,6 +170,11 @@ v3d_has_csd(struct v3d_dev *v3d)
 struct v3d_file_priv {
 	struct v3d_dev *v3d;
 
+	struct {
+		struct idr idr;
+		struct mutex lock;
+	} perfmon;
+
 	struct drm_sched_entity sched_entity[V3D_MAX_QUEUES];
 };
 
@@ -205,6 +247,11 @@ struct v3d_job {
 	 */
 	struct dma_fence *done_fence;
 
+	/* Pointer to a performance monitor object if the user requested it,
+	 * NULL otherwise.
+	 */
+	struct v3d_perfmon *perfmon;
+
 	/* Callback for the freeing of the job on refcount going to 0. */
 	void (*free)(struct kref *ref);
 };
@@ -353,3 +400,19 @@ void v3d_mmu_remove_ptes(struct v3d_bo *bo);
 /* v3d_sched.c */
 int v3d_sched_init(struct v3d_dev *v3d);
 void v3d_sched_fini(struct v3d_dev *v3d);
+
+/* v3d_perfmon.c */
+void v3d_perfmon_get(struct v3d_perfmon *perfmon);
+void v3d_perfmon_put(struct v3d_perfmon *perfmon);
+void v3d_perfmon_start(struct v3d_dev *v3d, struct v3d_perfmon *perfmon);
+void v3d_perfmon_stop(struct v3d_dev *v3d, struct v3d_perfmon *perfmon,
+		      bool capture);
+struct v3d_perfmon *v3d_perfmon_find(struct v3d_file_priv *v3d_priv, int id);
+void v3d_perfmon_open_file(struct v3d_file_priv *v3d_priv);
+void v3d_perfmon_close_file(struct v3d_file_priv *v3d_priv);
+int v3d_perfmon_create_ioctl(struct drm_device *dev, void *data,
+			     struct drm_file *file_priv);
+int v3d_perfmon_destroy_ioctl(struct drm_device *dev, void *data,
+			      struct drm_file *file_priv);
+int v3d_perfmon_get_values_ioctl(struct drm_device *dev, void *data,
+				 struct drm_file *file_priv);
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c
index 4eb3542269725f95227eeb8325fa03c2fdfba444..5689da118197ea87d4175d663abaf2a8a8c5ceef 100644
--- a/drivers/gpu/drm/v3d/v3d_gem.c
+++ b/drivers/gpu/drm/v3d/v3d_gem.c
@@ -126,6 +126,8 @@ v3d_reset(struct v3d_dev *v3d)
 	v3d_mmu_set_page_table(v3d);
 	v3d_irq_reset(v3d);
 
+	v3d_perfmon_stop(v3d, v3d->active_perfmon, false);
+
 	trace_v3d_reset_end(dev);
 }
 
@@ -375,6 +377,9 @@ v3d_job_free(struct kref *ref)
 	pm_runtime_mark_last_busy(job->v3d->drm.dev);
 	pm_runtime_put_autosuspend(job->v3d->drm.dev);
 
+	if (job->perfmon)
+		v3d_perfmon_put(job->perfmon);
+
 	kfree(job);
 }
 
@@ -539,6 +544,9 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
 
 	trace_v3d_submit_cl_ioctl(&v3d->drm, args->rcl_start, args->rcl_end);
 
+	if (args->pad != 0)
+		return -EINVAL;
+
 	if (args->flags != 0 &&
 	    args->flags != DRM_V3D_SUBMIT_CL_FLUSH_CACHE) {
 		DRM_INFO("invalid flags: %d\n", args->flags);
@@ -611,8 +619,20 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
 	if (ret)
 		goto fail;
 
+	if (args->perfmon_id) {
+		render->base.perfmon = v3d_perfmon_find(v3d_priv,
+							args->perfmon_id);
+
+		if (!render->base.perfmon) {
+			ret = -ENOENT;
+			goto fail;
+		}
+	}
+
 	mutex_lock(&v3d->sched_lock);
 	if (bin) {
+		bin->base.perfmon = render->base.perfmon;
+		v3d_perfmon_get(bin->base.perfmon);
 		ret = v3d_push_job(v3d_priv, &bin->base, V3D_BIN);
 		if (ret)
 			goto fail_unreserve;
@@ -633,6 +653,8 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
 		ret = drm_gem_fence_array_add(&clean_job->deps, render_fence);
 		if (ret)
 			goto fail_unreserve;
+		clean_job->perfmon = render->base.perfmon;
+		v3d_perfmon_get(clean_job->perfmon);
 		ret = v3d_push_job(v3d_priv, clean_job, V3D_CACHE_CLEAN);
 		if (ret)
 			goto fail_unreserve;
@@ -827,6 +849,15 @@ v3d_submit_csd_ioctl(struct drm_device *dev, void *data,
 	if (ret)
 		goto fail;
 
+	if (args->perfmon_id) {
+		job->base.perfmon = v3d_perfmon_find(v3d_priv,
+						     args->perfmon_id);
+		if (!job->base.perfmon) {
+			ret = -ENOENT;
+			goto fail;
+		}
+	}
+
 	mutex_lock(&v3d->sched_lock);
 	ret = v3d_push_job(v3d_priv, &job->base, V3D_CSD);
 	if (ret)
diff --git a/drivers/gpu/drm/v3d/v3d_perfmon.c b/drivers/gpu/drm/v3d/v3d_perfmon.c
new file mode 100644
index 0000000000000000000000000000000000000000..0288ef063513e6e2751d30fb5dfd3fd17c40a94a
--- /dev/null
+++ b/drivers/gpu/drm/v3d/v3d_perfmon.c
@@ -0,0 +1,213 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Raspberry Pi
+ */
+
+#include "v3d_drv.h"
+#include "v3d_regs.h"
+
+#define V3D_PERFMONID_MIN	1
+#define V3D_PERFMONID_MAX	U32_MAX
+
+void v3d_perfmon_get(struct v3d_perfmon *perfmon)
+{
+	if (perfmon)
+		refcount_inc(&perfmon->refcnt);
+}
+
+void v3d_perfmon_put(struct v3d_perfmon *perfmon)
+{
+	if (perfmon && refcount_dec_and_test(&perfmon->refcnt))
+		kfree(perfmon);
+}
+
+void v3d_perfmon_start(struct v3d_dev *v3d, struct v3d_perfmon *perfmon)
+{
+	unsigned int i;
+	u32 mask;
+	u8 ncounters = perfmon->ncounters;
+
+	if (WARN_ON_ONCE(!perfmon || v3d->active_perfmon))
+		return;
+
+	mask = GENMASK(ncounters - 1, 0);
+
+	for (i = 0; i < ncounters; i++) {
+		u32 source = i / 4;
+		u32 channel = V3D_SET_FIELD(perfmon->counters[i], V3D_PCTR_S0);
+
+		i++;
+		channel |= V3D_SET_FIELD(i < ncounters ? perfmon->counters[i] : 0,
+					 V3D_PCTR_S1);
+		i++;
+		channel |= V3D_SET_FIELD(i < ncounters ? perfmon->counters[i] : 0,
+					 V3D_PCTR_S2);
+		i++;
+		channel |= V3D_SET_FIELD(i < ncounters ? perfmon->counters[i] : 0,
+					 V3D_PCTR_S3);
+		V3D_CORE_WRITE(0, V3D_V4_PCTR_0_SRC_X(source), channel);
+	}
+
+	V3D_CORE_WRITE(0, V3D_V4_PCTR_0_CLR, mask);
+	V3D_CORE_WRITE(0, V3D_PCTR_0_OVERFLOW, mask);
+	V3D_CORE_WRITE(0, V3D_V4_PCTR_0_EN, mask);
+
+	v3d->active_perfmon = perfmon;
+}
+
+void v3d_perfmon_stop(struct v3d_dev *v3d, struct v3d_perfmon *perfmon,
+		      bool capture)
+{
+	unsigned int i;
+
+	if (!perfmon || !v3d->active_perfmon)
+		return;
+
+	mutex_lock(&perfmon->lock);
+	if (perfmon != v3d->active_perfmon) {
+		mutex_unlock(&perfmon->lock);
+		return;
+	}
+
+	if (capture)
+		for (i = 0; i < perfmon->ncounters; i++)
+			perfmon->values[i] += V3D_CORE_READ(0, V3D_PCTR_0_PCTRX(i));
+
+	V3D_CORE_WRITE(0, V3D_V4_PCTR_0_EN, 0);
+
+	v3d->active_perfmon = NULL;
+	mutex_unlock(&perfmon->lock);
+}
+
+struct v3d_perfmon *v3d_perfmon_find(struct v3d_file_priv *v3d_priv, int id)
+{
+	struct v3d_perfmon *perfmon;
+
+	mutex_lock(&v3d_priv->perfmon.lock);
+	perfmon = idr_find(&v3d_priv->perfmon.idr, id);
+	v3d_perfmon_get(perfmon);
+	mutex_unlock(&v3d_priv->perfmon.lock);
+
+	return perfmon;
+}
+
+void v3d_perfmon_open_file(struct v3d_file_priv *v3d_priv)
+{
+	mutex_init(&v3d_priv->perfmon.lock);
+	idr_init(&v3d_priv->perfmon.idr);
+}
+
+static int v3d_perfmon_idr_del(int id, void *elem, void *data)
+{
+	struct v3d_perfmon *perfmon = elem;
+
+	v3d_perfmon_put(perfmon);
+
+	return 0;
+}
+
+void v3d_perfmon_close_file(struct v3d_file_priv *v3d_priv)
+{
+	mutex_lock(&v3d_priv->perfmon.lock);
+	idr_for_each(&v3d_priv->perfmon.idr, v3d_perfmon_idr_del, NULL);
+	idr_destroy(&v3d_priv->perfmon.idr);
+	mutex_unlock(&v3d_priv->perfmon.lock);
+}
+
+int v3d_perfmon_create_ioctl(struct drm_device *dev, void *data,
+			     struct drm_file *file_priv)
+{
+	struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
+	struct drm_v3d_perfmon_create *req = data;
+	struct v3d_perfmon *perfmon;
+	unsigned int i;
+	int ret;
+
+	/* Number of monitored counters cannot exceed HW limits. */
+	if (req->ncounters > DRM_V3D_MAX_PERF_COUNTERS ||
+	    !req->ncounters)
+		return -EINVAL;
+
+	/* Make sure all counters are valid. */
+	for (i = 0; i < req->ncounters; i++) {
+		if (req->counters[i] >= V3D_PERFCNT_NUM)
+			return -EINVAL;
+	}
+
+	perfmon = kzalloc(struct_size(perfmon, values, req->ncounters),
+			  GFP_KERNEL);
+	if (!perfmon)
+		return -ENOMEM;
+
+	for (i = 0; i < req->ncounters; i++)
+		perfmon->counters[i] = req->counters[i];
+
+	perfmon->ncounters = req->ncounters;
+
+	refcount_set(&perfmon->refcnt, 1);
+	mutex_init(&perfmon->lock);
+
+	mutex_lock(&v3d_priv->perfmon.lock);
+	ret = idr_alloc(&v3d_priv->perfmon.idr, perfmon, V3D_PERFMONID_MIN,
+			V3D_PERFMONID_MAX, GFP_KERNEL);
+	mutex_unlock(&v3d_priv->perfmon.lock);
+
+	if (ret < 0) {
+		kfree(perfmon);
+		return ret;
+	}
+
+	req->id = ret;
+
+	return 0;
+}
+
+int v3d_perfmon_destroy_ioctl(struct drm_device *dev, void *data,
+			      struct drm_file *file_priv)
+{
+	struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
+	struct drm_v3d_perfmon_destroy *req = data;
+	struct v3d_perfmon *perfmon;
+
+	mutex_lock(&v3d_priv->perfmon.lock);
+	perfmon = idr_remove(&v3d_priv->perfmon.idr, req->id);
+	mutex_unlock(&v3d_priv->perfmon.lock);
+
+	if (!perfmon)
+		return -EINVAL;
+
+	v3d_perfmon_put(perfmon);
+
+	return 0;
+}
+
+int v3d_perfmon_get_values_ioctl(struct drm_device *dev, void *data,
+				 struct drm_file *file_priv)
+{
+	struct v3d_dev *v3d = to_v3d_dev(dev);
+	struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
+	struct drm_v3d_perfmon_get_values *req = data;
+	struct v3d_perfmon *perfmon;
+	int ret = 0;
+
+	if (req->pad != 0)
+		return -EINVAL;
+
+	mutex_lock(&v3d_priv->perfmon.lock);
+	perfmon = idr_find(&v3d_priv->perfmon.idr, req->id);
+	v3d_perfmon_get(perfmon);
+	mutex_unlock(&v3d_priv->perfmon.lock);
+
+	if (!perfmon)
+		return -EINVAL;
+
+	v3d_perfmon_stop(v3d, perfmon, true);
+
+	if (copy_to_user(u64_to_user_ptr(req->values_ptr), perfmon->values,
+			 perfmon->ncounters * sizeof(u64)))
+		ret = -EFAULT;
+
+	v3d_perfmon_put(perfmon);
+
+	return ret;
+}
diff --git a/drivers/gpu/drm/v3d/v3d_regs.h b/drivers/gpu/drm/v3d/v3d_regs.h
index 9bcb57781d313f9b923d7594a7fff6cbfb92a2b6..3663e0d6bf766f84185fd16b7301731870e6eb97 100644
--- a/drivers/gpu/drm/v3d/v3d_regs.h
+++ b/drivers/gpu/drm/v3d/v3d_regs.h
@@ -347,6 +347,8 @@
 /* Each src reg muxes four counters each. */
 #define V3D_V4_PCTR_0_SRC_0_3                          0x00660
 #define V3D_V4_PCTR_0_SRC_28_31                        0x0067c
+#define V3D_V4_PCTR_0_SRC_X(x)                         (V3D_V4_PCTR_0_SRC_0_3 + \
+							4 * (x))
 # define V3D_PCTR_S0_MASK                              V3D_MASK(6, 0)
 # define V3D_PCTR_S0_SHIFT                             0
 # define V3D_PCTR_S1_MASK                              V3D_MASK(14, 8)
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index a39bdd5cfc4f7c953a70c63891711b8b911b3f19..dd7fcc36d726a289de0a6c8dac969de05d3523ac 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -63,6 +63,16 @@ v3d_job_free(struct drm_sched_job *sched_job)
 	v3d_job_put(job);
 }
 
+static void
+v3d_switch_perfmon(struct v3d_dev *v3d, struct v3d_job *job)
+{
+	if (job->perfmon != v3d->active_perfmon)
+		v3d_perfmon_stop(v3d, v3d->active_perfmon, true);
+
+	if (job->perfmon && v3d->active_perfmon != job->perfmon)
+		v3d_perfmon_start(v3d, job->perfmon);
+}
+
 /*
  * Returns the fences that the job depends on, one by one.
  *
@@ -120,6 +130,8 @@ static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job)
 	trace_v3d_submit_cl(dev, false, to_v3d_fence(fence)->seqno,
 			    job->start, job->end);
 
+	v3d_switch_perfmon(v3d, &job->base);
+
 	/* Set the current and end address of the control list.
 	 * Writing the end register is what starts the job.
 	 */
@@ -169,6 +181,8 @@ static struct dma_fence *v3d_render_job_run(struct drm_sched_job *sched_job)
 	trace_v3d_submit_cl(dev, true, to_v3d_fence(fence)->seqno,
 			    job->start, job->end);
 
+	v3d_switch_perfmon(v3d, &job->base);
+
 	/* XXX: Set the QCFG */
 
 	/* Set the current and end address of the control list.
@@ -240,6 +254,8 @@ v3d_csd_job_run(struct drm_sched_job *sched_job)
 
 	trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno);
 
+	v3d_switch_perfmon(v3d, &job->base);
+
 	for (i = 1; i <= 6; i++)
 		V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0 + 4 * i, job->args.cfg[i]);
 	/* CFG0 write kicks off the job. */
diff --git a/include/uapi/drm/v3d_drm.h b/include/uapi/drm/v3d_drm.h
index 1ce746e228d99bc7c3569c48692a0dcc490c17fa..4104f22fb3d341a0074c7a987aa35b2b93cf8f40 100644
--- a/include/uapi/drm/v3d_drm.h
+++ b/include/uapi/drm/v3d_drm.h
@@ -38,6 +38,9 @@ extern "C" {
 #define DRM_V3D_GET_BO_OFFSET                     0x05
 #define DRM_V3D_SUBMIT_TFU                        0x06
 #define DRM_V3D_SUBMIT_CSD                        0x07
+#define DRM_V3D_PERFMON_CREATE                    0x08
+#define DRM_V3D_PERFMON_DESTROY                   0x09
+#define DRM_V3D_PERFMON_GET_VALUES                0x0a
 
 #define DRM_IOCTL_V3D_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CL, struct drm_v3d_submit_cl)
 #define DRM_IOCTL_V3D_WAIT_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_WAIT_BO, struct drm_v3d_wait_bo)
@@ -47,6 +50,12 @@ extern "C" {
 #define DRM_IOCTL_V3D_GET_BO_OFFSET       DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_BO_OFFSET, struct drm_v3d_get_bo_offset)
 #define DRM_IOCTL_V3D_SUBMIT_TFU          DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_TFU, struct drm_v3d_submit_tfu)
 #define DRM_IOCTL_V3D_SUBMIT_CSD          DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CSD, struct drm_v3d_submit_csd)
+#define DRM_IOCTL_V3D_PERFMON_CREATE      DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_PERFMON_CREATE, \
+						   struct drm_v3d_perfmon_create)
+#define DRM_IOCTL_V3D_PERFMON_DESTROY     DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_PERFMON_DESTROY, \
+						   struct drm_v3d_perfmon_destroy)
+#define DRM_IOCTL_V3D_PERFMON_GET_VALUES  DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_PERFMON_GET_VALUES, \
+						   struct drm_v3d_perfmon_get_values)
 
 #define DRM_V3D_SUBMIT_CL_FLUSH_CACHE             0x01
 
@@ -127,6 +136,11 @@ struct drm_v3d_submit_cl {
 	__u32 bo_handle_count;
 
 	__u32 flags;
+
+	/* ID of the perfmon to attach to this job. 0 means no perfmon. */
+	__u32 perfmon_id;
+
+	__u32 pad;
 };
 
 /**
@@ -195,6 +209,7 @@ enum drm_v3d_param {
 	DRM_V3D_PARAM_SUPPORTS_TFU,
 	DRM_V3D_PARAM_SUPPORTS_CSD,
 	DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH,
+	DRM_V3D_PARAM_SUPPORTS_PERFMON,
 };
 
 struct drm_v3d_get_param {
@@ -258,6 +273,127 @@ struct drm_v3d_submit_csd {
 	__u32 in_sync;
 	/* Sync object to signal when the CSD job is done. */
 	__u32 out_sync;
+
+	/* ID of the perfmon to attach to this job. 0 means no perfmon. */
+	__u32 perfmon_id;
+};
+
+enum {
+	V3D_PERFCNT_FEP_VALID_PRIMTS_NO_PIXELS,
+	V3D_PERFCNT_FEP_VALID_PRIMS,
+	V3D_PERFCNT_FEP_EZ_NFCLIP_QUADS,
+	V3D_PERFCNT_FEP_VALID_QUADS,
+	V3D_PERFCNT_TLB_QUADS_STENCIL_FAIL,
+	V3D_PERFCNT_TLB_QUADS_STENCILZ_FAIL,
+	V3D_PERFCNT_TLB_QUADS_STENCILZ_PASS,
+	V3D_PERFCNT_TLB_QUADS_ZERO_COV,
+	V3D_PERFCNT_TLB_QUADS_NONZERO_COV,
+	V3D_PERFCNT_TLB_QUADS_WRITTEN,
+	V3D_PERFCNT_PTB_PRIM_VIEWPOINT_DISCARD,
+	V3D_PERFCNT_PTB_PRIM_CLIP,
+	V3D_PERFCNT_PTB_PRIM_REV,
+	V3D_PERFCNT_QPU_IDLE_CYCLES,
+	V3D_PERFCNT_QPU_ACTIVE_CYCLES_VERTEX_COORD_USER,
+	V3D_PERFCNT_QPU_ACTIVE_CYCLES_FRAG,
+	V3D_PERFCNT_QPU_CYCLES_VALID_INSTR,
+	V3D_PERFCNT_QPU_CYCLES_TMU_STALL,
+	V3D_PERFCNT_QPU_CYCLES_SCOREBOARD_STALL,
+	V3D_PERFCNT_QPU_CYCLES_VARYINGS_STALL,
+	V3D_PERFCNT_QPU_IC_HIT,
+	V3D_PERFCNT_QPU_IC_MISS,
+	V3D_PERFCNT_QPU_UC_HIT,
+	V3D_PERFCNT_QPU_UC_MISS,
+	V3D_PERFCNT_TMU_TCACHE_ACCESS,
+	V3D_PERFCNT_TMU_TCACHE_MISS,
+	V3D_PERFCNT_VPM_VDW_STALL,
+	V3D_PERFCNT_VPM_VCD_STALL,
+	V3D_PERFCNT_BIN_ACTIVE,
+	V3D_PERFCNT_RDR_ACTIVE,
+	V3D_PERFCNT_L2T_HITS,
+	V3D_PERFCNT_L2T_MISSES,
+	V3D_PERFCNT_CYCLE_COUNT,
+	V3D_PERFCNT_QPU_CYCLES_STALLED_VERTEX_COORD_USER,
+	V3D_PERFCNT_QPU_CYCLES_STALLED_FRAGMENT,
+	V3D_PERFCNT_PTB_PRIMS_BINNED,
+	V3D_PERFCNT_AXI_WRITES_WATCH_0,
+	V3D_PERFCNT_AXI_READS_WATCH_0,
+	V3D_PERFCNT_AXI_WRITE_STALLS_WATCH_0,
+	V3D_PERFCNT_AXI_READ_STALLS_WATCH_0,
+	V3D_PERFCNT_AXI_WRITE_BYTES_WATCH_0,
+	V3D_PERFCNT_AXI_READ_BYTES_WATCH_0,
+	V3D_PERFCNT_AXI_WRITES_WATCH_1,
+	V3D_PERFCNT_AXI_READS_WATCH_1,
+	V3D_PERFCNT_AXI_WRITE_STALLS_WATCH_1,
+	V3D_PERFCNT_AXI_READ_STALLS_WATCH_1,
+	V3D_PERFCNT_AXI_WRITE_BYTES_WATCH_1,
+	V3D_PERFCNT_AXI_READ_BYTES_WATCH_1,
+	V3D_PERFCNT_TLB_PARTIAL_QUADS,
+	V3D_PERFCNT_TMU_CONFIG_ACCESSES,
+	V3D_PERFCNT_L2T_NO_ID_STALL,
+	V3D_PERFCNT_L2T_COM_QUE_STALL,
+	V3D_PERFCNT_L2T_TMU_WRITES,
+	V3D_PERFCNT_TMU_ACTIVE_CYCLES,
+	V3D_PERFCNT_TMU_STALLED_CYCLES,
+	V3D_PERFCNT_CLE_ACTIVE,
+	V3D_PERFCNT_L2T_TMU_READS,
+	V3D_PERFCNT_L2T_CLE_READS,
+	V3D_PERFCNT_L2T_VCD_READS,
+	V3D_PERFCNT_L2T_TMUCFG_READS,
+	V3D_PERFCNT_L2T_SLC0_READS,
+	V3D_PERFCNT_L2T_SLC1_READS,
+	V3D_PERFCNT_L2T_SLC2_READS,
+	V3D_PERFCNT_L2T_TMU_W_MISSES,
+	V3D_PERFCNT_L2T_TMU_R_MISSES,
+	V3D_PERFCNT_L2T_CLE_MISSES,
+	V3D_PERFCNT_L2T_VCD_MISSES,
+	V3D_PERFCNT_L2T_TMUCFG_MISSES,
+	V3D_PERFCNT_L2T_SLC0_MISSES,
+	V3D_PERFCNT_L2T_SLC1_MISSES,
+	V3D_PERFCNT_L2T_SLC2_MISSES,
+	V3D_PERFCNT_CORE_MEM_WRITES,
+	V3D_PERFCNT_L2T_MEM_WRITES,
+	V3D_PERFCNT_PTB_MEM_WRITES,
+	V3D_PERFCNT_TLB_MEM_WRITES,
+	V3D_PERFCNT_CORE_MEM_READS,
+	V3D_PERFCNT_L2T_MEM_READS,
+	V3D_PERFCNT_PTB_MEM_READS,
+	V3D_PERFCNT_PSE_MEM_READS,
+	V3D_PERFCNT_TLB_MEM_READS,
+	V3D_PERFCNT_GMP_MEM_READS,
+	V3D_PERFCNT_PTB_W_MEM_WORDS,
+	V3D_PERFCNT_TLB_W_MEM_WORDS,
+	V3D_PERFCNT_PSE_R_MEM_WORDS,
+	V3D_PERFCNT_TLB_R_MEM_WORDS,
+	V3D_PERFCNT_TMU_MRU_HITS,
+	V3D_PERFCNT_COMPUTE_ACTIVE,
+	V3D_PERFCNT_NUM,
+};
+
+#define DRM_V3D_MAX_PERF_COUNTERS                 32
+
+struct drm_v3d_perfmon_create {
+	__u32 id;
+	__u32 ncounters;
+	__u8 counters[DRM_V3D_MAX_PERF_COUNTERS];
+};
+
+struct drm_v3d_perfmon_destroy {
+	__u32 id;
+};
+
+/*
+ * Returns the values of the performance counters tracked by this
+ * perfmon (as an array of ncounters u64 values).
+ *
+ * No implicit synchronization is performed, so the user has to
+ * guarantee that any jobs using this perfmon have already been
+ * completed  (probably by blocking on the seqno returned by the
+ * last exec that used the perfmon).
+ */
+struct drm_v3d_perfmon_get_values {
+	__u32 id;
+	__u32 pad;
+	__u64 values_ptr;
 };
 
 #if defined(__cplusplus)