drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

Source file repositories/reference/linux-study-clean/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

File Facts

System: Linux kernel
Corpus path: drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
Extension: .c
Size: 14194 bytes
Lines: 518
Domain: Driver Families
Bucket: drivers/gpu
Inferred role: Driver Families: implementation source
Status: source implementation candidate

Why This File Exists

Repeatable hardware-adapter layer. Deep compatibility for every driver is out of scope; this atlas records patterns, probe lifecycles, bus glue, IRQ/DMA usage, and links back to core abstractions.

Uses kernel synchronization; review lock ordering, sleepability, and interrupt-context assumptions before translating.
Allocates kernel memory; connect allocation flags and lifetime to context constraints.
Defines or uses C structs; map object ownership, embedded links, reference counts, and lock ownership.

Dependency Surface

linux/kthread.h
linux/wait.h
linux/sched.h
drm/drm_drv.h
amdgpu.h
amdgpu_trace.h
amdgpu_reset.h
amdgpu_dev_coredump.h
amdgpu_xgmi.h

Detected Declarations

function files
function amdgpu_job_core_dump
function amdgpu_job_timedout
function amdgpu_ring_is_reset_type_supported
function amdgpu_job_alloc
function amdgpu_job_alloc_with_ib
function amdgpu_job_set_resources
function amdgpu_job_free_resources
function amdgpu_job_free_cb
function amdgpu_job_set_gang_leader
function amdgpu_job_free
function amdgpu_job_submit_direct
function amdgpu_job_prepare_job
function drm_sched_entity_queue_pop
function amdgpu_job_stop_all_jobs_on_sched

Annotated Snippet

amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
		dev_err(adev->dev, "ring %s timeout, but soft recovered\n",
			s_job->sched->name);
		goto exit;
	}

	dev_err(adev->dev, "ring %s timeout, signaled seq=%u, emitted seq=%u\n",
		job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
		ring->fence_drv.sync_seq);

	ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid);
	if (ti) {
		amdgpu_vm_print_task_info(adev, ti);
		info = &ti->task;
	}

	/* attempt a per ring reset */
	if (amdgpu_gpu_recovery &&
	    amdgpu_ring_is_reset_type_supported(ring, AMDGPU_RESET_TYPE_PER_QUEUE) &&
	    ring->funcs->reset) {
		dev_err(adev->dev, "Starting %s ring reset\n",
			s_job->sched->name);
		/* Stop the scheduler to prevent anybody else from touching the ring buffer. */
		drm_sched_wqueue_stop(&ring->sched);
		r = amdgpu_ring_reset(ring, job->vmid, job->hw_fence);
		if (!r) {
			/* Start the scheduler again */
			drm_sched_wqueue_start(&ring->sched);
			atomic_inc(&ring->adev->gpu_reset_counter);
			dev_err(adev->dev, "Ring %s reset succeeded\n",
				ring->sched.name);
			drm_dev_wedged_event(adev_to_drm(adev),
					     DRM_WEDGE_RECOVERY_NONE, info);
			goto exit;
		}
		dev_err(adev->dev, "Ring %s reset failed\n", ring->sched.name);
	}

	if (dma_fence_get_status(&s_job->s_fence->finished) == 0)
		dma_fence_set_error(&s_job->s_fence->finished, -ETIME);

	if (amdgpu_device_should_recover_gpu(ring->adev)) {
		struct amdgpu_reset_context reset_context;
		memset(&reset_context, 0, sizeof(reset_context));

		reset_context.method = AMD_RESET_METHOD_NONE;
		reset_context.reset_req_dev = adev;
		reset_context.src = AMDGPU_RESET_SRC_JOB;
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

		/*
		 * To avoid an unnecessary extra coredump, as we have already
		 * got the very close representation of GPU's error status
		 */
		set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);

		r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
		if (r)
			dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);
	} else {
		drm_sched_suspend_timeout(&ring->sched);
		if (amdgpu_sriov_vf(adev))
			adev->virt.tdr_debug = true;
	}

exit:
	amdgpu_vm_put_task_info(ti);
	drm_dev_exit(idx);
	/* This is needed to add the job back to the pending list */
	return DRM_GPU_SCHED_STAT_NO_HANG;
}

int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
		     struct drm_sched_entity *entity, void *owner,
		     unsigned int num_ibs, struct amdgpu_job **job,
		     u64 drm_client_id)
{
	struct amdgpu_fence *af;
	int r;

	if (num_ibs == 0)
		return -EINVAL;

	*job = kzalloc_flex(**job, ibs, num_ibs);
	if (!*job)
		return -ENOMEM;

	af = kzalloc_obj(struct amdgpu_fence);
	if (!af) {
		r = -ENOMEM;

Annotation

Immediate include surface: `linux/kthread.h`, `linux/wait.h`, `linux/sched.h`, `drm/drm_drv.h`, `amdgpu.h`, `amdgpu_trace.h`, `amdgpu_reset.h`, `amdgpu_dev_coredump.h`, `amdgpu_xgmi.h`.
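Detected declarations: `function files`, `function amdgpu_job_core_dump`, `function amdgpu_job_timedout`, `function amdgpu_ring_is_reset_type_supported`, `function amdgpu_job_alloc`, `function amdgpu_job_alloc_with_ib`, `function amdgpu_job_set_resources`, `function amdgpu_job_free_resources`, `function amdgpu_job_free_cb`, `function amdgpu_job_set_gang_leader`, `function amdgpu_job_free`, `function amdgpu_job_submit_direct`, `function amdgpu_job_prepare_job`, `function drm_sched_entity_queue_pop`, `function amdgpu_job_stop_all_jobs_on_sched`.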
Atlas domain: Driver Families / drivers/gpu.
Implementation status: source implementation candidate.
Synchronization appears in or near this file; preserve lock ordering, sleepability, and interrupt-context constraints.

Implementation Notes

This generated page is the file-by-file coverage layer; curated subsystem chapters should link here when they synthesize a multi-file control flow.
Core OS pages should be promoted from atlas-only to deep-reviewed when they explain data structures, invariants, locking, lifecycle, and C implementation snippets.
Driver-family pages are intentionally pattern-oriented unless they are part of the selected PCIe/NVMe representative device path.