NPU Controller Modules¶
RTL source on GitHub
SystemVerilog sources documented on this page:
hw/rtl/NPU_Controller/npu_controller_top.sv — View on GitHub
hw/rtl/NPU_Controller/NPU_Control_Unit/ctrl_npu_decoder.sv — View on GitHub
hw/rtl/NPU_Controller/Global_Scheduler.sv — View on GitHub
1. Controller Top¶
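npu_controller_top.sv integrates the AXI-Lite frontend and the instruction
decoder into a single controller boundary. The global scheduler (section 3)
is a separate module: it consumes this wrapper's decoded valid pulses and
instruction body, and is not instantiated inside the wrapper shown below.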
`timescale 1ns / 1ps
`include "GLOBAL_CONST.svh"
import isa_pkg::*;
// ===| Module: npu_controller_top — control-plane wrapper |=====================
// Purpose : Aggregate AXIL frontend + opcode decoder behind one boundary.
// Hides FIFO/handshake/decoding complexity from NPU_top.
// Spec ref : pccx v002 §3 (ISA), §4 (control plane).
// Clock : clk (= clk_core, 400 MHz).
// Reset : rst_n active-low; i_clear synchronous soft-clear.
// Latency : Decode is 1-cycle registered after AXIL kick (ctrl_npu_decoder).
// Throughput : Issues at most 1 decoded uop per clock (ISA serial issue).
// Handshake : One-hot OUT_*_op_x64_valid pulses — exactly one (or none)
// asserted per cycle. Raw 60-bit body driven on OUT_op_x64.
// Backpressure : fetch_PC_ready runs from the decoder back to the AXIL frontend;
// ctrl_npu_decoder currently ties it high, so pops are never stalled.
// Reset state : All OUT_*_op_x64_valid = 0; OUT_op_x64 = 0.
// Errors : none surfaced (illegal opcodes silently dropped — TODO).
// Counters : none.
// Assertions : (Stage C) one-hot of decoded valids; raw_instruction stable
// while pop_valid && !fetch_PC_ready.
// ===============================================================================
// npu_controller_top
// ------------------
// Structural wrapper: contains no logic of its own. It instantiates the
// AXI-Lite frontend (ctrl_npu_frontend) and the opcode decoder
// (ctrl_npu_decoder) and wires them together:
//
//   frontend.OUT_RAW_instruction --> decoder.IN_raw_instruction
//   frontend.OUT_kick            --> decoder.raw_instruction_pop_valid
//   decoder.OUT_fetch_PC_ready   --> frontend.IN_fetch_ready
//
// The decoder's five valid pulses and its instruction body are passed straight
// through to this module's outputs.
//
// Ports
//   clk, rst_n    : shared by both sub-blocks.
//   i_clear       : forwarded to the frontend only (IN_clear); the decoder has
//                   no clear input.
//   S_AXIL_CTRL   : AXI4-Lite slave interface, handed to the frontend unchanged.
//   OUT_*_valid   : decoded per-opcode valid pulses from the decoder.
//   OUT_op_x64    : instruction body from the decoder.
module npu_controller_top #() (
    input  logic                clk,
    input  logic                rst_n,
    input  logic                i_clear,

    // AXI4-Lite slave (control plane)
    axil_if.slave               S_AXIL_CTRL,

    // Per-opcode valid pulses
    output logic                OUT_GEMV_op_x64_valid,
    output logic                OUT_GEMM_op_x64_valid,
    output logic                OUT_memcpy_op_x64_valid,
    output logic                OUT_memset_op_x64_valid,
    output logic                OUT_cvo_op_x64_valid,

    // Instruction body accompanying the valid pulses
    output instruction_op_x64_t OUT_op_x64
);

  // ---------------------------------------------------------------------------
  // Frontend <-> decoder nets
  // (names kept stable so hierarchical references / bound assertions still resolve)
  // ---------------------------------------------------------------------------
  logic [`ISA_WIDTH-1:0] raw_instruction;            // full instruction word
  logic                  raw_instruction_pop_valid;  // frontend "kick"
  logic                  fetch_PC_ready;             // decoder -> frontend ready

  // ---------------------------------------------------------------------------
  // AXI-Lite frontend
  // ---------------------------------------------------------------------------
  ctrl_npu_frontend #() u_npu_frontend (
      // clocking / reset / soft clear
      .clk                (clk),
      .rst_n              (rst_n),
      .IN_clear           (i_clear),

      // bus side
      .S_AXIL_CTRL        (S_AXIL_CTRL),

      // status inputs are not driven by anything in this wrapper: tied off
      .IN_enc_stat        ('0),
      .IN_enc_valid       (1'b0),

      // instruction stream towards the decoder
      .IN_fetch_ready     (fetch_PC_ready),
      .OUT_kick           (raw_instruction_pop_valid),
      .OUT_RAW_instruction(raw_instruction)
  );

  // ---------------------------------------------------------------------------
  // Opcode decoder
  // ---------------------------------------------------------------------------
  ctrl_npu_decoder u_decoder (
      .clk                      (clk),
      .rst_n                    (rst_n),

      // instruction stream from the frontend
      .raw_instruction_pop_valid(raw_instruction_pop_valid),
      .IN_raw_instruction       (raw_instruction),
      .OUT_fetch_PC_ready       (fetch_PC_ready),

      // decoded outputs, exported unchanged
      .OUT_op_x64               (OUT_op_x64),
      .OUT_GEMV_op_x64_valid    (OUT_GEMV_op_x64_valid),
      .OUT_GEMM_op_x64_valid    (OUT_GEMM_op_x64_valid),
      .OUT_memcpy_op_x64_valid  (OUT_memcpy_op_x64_valid),
      .OUT_memset_op_x64_valid  (OUT_memset_op_x64_valid),
      .OUT_cvo_op_x64_valid     (OUT_cvo_op_x64_valid)
  );

endmodule
2. Instruction Decoder¶
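ctrl_npu_decoder.sv parses the 64-bit VLIW instruction word: it strips
the 4-bit opcode, raises the matching per-opcode valid pulse, and forwards
the 60-bit body untyped as instruction_op_x64_t. The cast into the typed
structs (GEMV_op_x64_t, memcpy_op_x64_t, etc.) is done downstream in
Global_Scheduler.sv.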
`timescale 1ns / 1ps
`include "GLOBAL_CONST.svh"
import isa_pkg::*;
// ===| Module: ctrl_npu_decoder — VLIW opcode → engine valid demux |============
// Purpose : Receive raw 64-bit VLIW instructions from the frontend FIFO,
// strip the 4-bit opcode, assert exactly one matching valid
// pulse for one cycle, and forward the 60-bit body to the
// Global Scheduler.
// Spec ref : pccx v002 §3 (ISA), §3.1 (opcode encoding).
// Clock : clk @ 400 MHz.
// Reset : rst_n active-low.
// Latency : 1-cycle registered (raw_instruction_pop_valid → OUT_*_valid).
// Throughput : 1 instruction/cycle (decoder is purely combinational save
// for the output register).
// Handshake : OUT_fetch_PC_ready asserted unconditionally — frontend FIFO
// provides buffering, decoder is single-cycle.
// Reset state : All OUT_*_op_x64_valid = 0; OUT_op_x64 = 0.
// Errors : Unknown opcodes are silently dropped (no valid asserted).
// Assertions : (Stage C) one-hot of OUT_*_op_x64_valid; valid pulses
// are exactly one cycle wide.
// Notes : OP_CVO uses a separate FF (cvo_valid_ff) outside the
// 4-bit OUT_valid bus because it is the 5th opcode.
// ===============================================================================
// ctrl_npu_decoder
// ----------------
// Splits each incoming instruction word into an opcode field (the top
// `ISA_OPCODE_WIDTH bits) and a body field (the low `ISA_BODY_WIDTH bits).
// On a cycle where raw_instruction_pop_valid is high, the body is captured into
// OUT_op_x64.instruction and at most one of the five valid outputs is set for
// the following cycle. All valid outputs return to 0 on any cycle that is not
// preceded by a recognised pop.
//
// Ports
//   clk, rst_n                : clock and active-low reset (sampled on posedge
//                               clk, i.e. synchronous).
//   IN_raw_instruction        : full instruction word from the frontend.
//   raw_instruction_pop_valid : qualifies IN_raw_instruction for this cycle.
//   OUT_fetch_PC_ready        : constant 1 - this block never stalls the source.
//   OUT_*_op_x64_valid        : registered per-opcode valid pulses.
//   OUT_op_x64                : registered instruction body; holds its value
//                               between pops.
//
// Note: the body register is written on every pop, including pops whose
// opcode matches none of the five known values; only the valid pulses are
// suppressed in that case.
module ctrl_npu_decoder (
    input  logic                  clk,
    input  logic                  rst_n,

    // Instruction word from the frontend
    input  logic [`ISA_WIDTH-1:0] IN_raw_instruction,
    input  logic                  raw_instruction_pop_valid,

    // Flow control back to the frontend
    output logic                  OUT_fetch_PC_ready,

    // Registered per-opcode valid pulses
    output logic                  OUT_GEMV_op_x64_valid,
    output logic                  OUT_GEMM_op_x64_valid,
    output logic                  OUT_memcpy_op_x64_valid,
    output logic                  OUT_memset_op_x64_valid,
    output logic                  OUT_cvo_op_x64_valid,

    // Registered instruction body
    output instruction_op_x64_t   OUT_op_x64
);

  // ---------------------------------------------------------------------------
  // Field split
  // ---------------------------------------------------------------------------
  logic [`ISA_OPCODE_WIDTH-1:0] opcode_field;
  logic [`ISA_BODY_WIDTH-1:0]   body_field;

  assign opcode_field = IN_raw_instruction[`ISA_WIDTH-1:`ISA_WIDTH-`ISA_OPCODE_WIDTH];
  assign body_field   = IN_raw_instruction[`ISA_BODY_WIDTH-1:0];

  // ---------------------------------------------------------------------------
  // Valid registers
  //   OUT_valid[0]=GEMV, [1]=GEMM, [2]=MEMCPY, [3]=MEMSET; CVO has its own flop.
  // ---------------------------------------------------------------------------
  logic [3:0] OUT_valid;
  logic       cvo_valid_ff;

  assign {OUT_memset_op_x64_valid,
          OUT_memcpy_op_x64_valid,
          OUT_GEMM_op_x64_valid,
          OUT_GEMV_op_x64_valid} = OUT_valid;
  assign OUT_cvo_op_x64_valid    = cvo_valid_ff;

  // ---------------------------------------------------------------------------
  // Next-state decode (combinational)
  // Defaults to "no valid"; a pop with a recognised opcode sets exactly one bit.
  // ---------------------------------------------------------------------------
  logic [3:0] valid_nxt;
  logic       cvo_valid_nxt;

  always_comb begin
    valid_nxt     = 4'b0000;
    cvo_valid_nxt = 1'b0;

    if (raw_instruction_pop_valid) begin
      case (opcode_field)
        OP_GEMV:   valid_nxt     = 4'b0001;
        OP_GEMM:   valid_nxt     = 4'b0010;
        OP_MEMCPY: valid_nxt     = 4'b0100;
        OP_MEMSET: valid_nxt     = 4'b1000;
        OP_CVO:    cvo_valid_nxt = 1'b1;
        default:   ;  // unrecognised opcode: no valid is raised
      endcase
    end
  end

  // ---------------------------------------------------------------------------
  // Output registers
  // ---------------------------------------------------------------------------
  always_ff @(posedge clk) begin
    if (!rst_n) begin
      OUT_valid    <= 4'b0000;
      cvo_valid_ff <= 1'b0;
      OUT_op_x64   <= '0;
    end else begin
      OUT_valid    <= valid_nxt;
      cvo_valid_ff <= cvo_valid_nxt;

      // Body is captured on every pop and otherwise holds.
      if (raw_instruction_pop_valid) begin
        OUT_op_x64.instruction <= body_field;
      end
    end
  end

  // ---------------------------------------------------------------------------
  // Flow control: never deasserted by this block.
  // ---------------------------------------------------------------------------
  assign OUT_fetch_PC_ready = 1'b1;

endmodule
3. Global Scheduler¶
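Global_Scheduler.sv receives the decoded valid pulses together with the
60-bit instruction body, casts the body to the typed struct for each
opcode, and registers the per-engine control μops (GEMM, GEMV, CVO and
MEMSET, plus the LOAD and STORE memory μops). The listing below does not
itself track in-flight async instructions, maintain a dependency
scoreboard, or gate dispatch on hazards; for dependency and completion
tracking see Per-Instruction Dataflow.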
`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "GLOBAL_CONST.svh"
import isa_pkg::*;
// ===| Module: Global_Scheduler — VLIW → engine micro-op translator |============
// Purpose : Decode 60-bit VLIW body and emit per-engine uops for
// GEMM / GEMV / MEMCPY / MEMSET / CVO with deterministic
// priority.
// Spec ref : pccx v002 §3 (ISA decode), §4.2 (uop semantics).
// Clock : clk_core @ 400 MHz.
// Reset : rst_n_core active-low.
// Latency : 1-cycle registered uop output after IN_*_op_x64_valid pulse.
// Throughput : 1 uop/cycle per output channel; channels are mutually
// exclusive in time per ISA serial-issue semantics.
// Priority : OUT_LOAD_uop arbitration order:
// GEMM > GEMV > MEMCPY > CVO (single-driver always_ff).
// Outputs :
// OUT_GEMM_uop / OUT_GEMV_uop / OUT_CVO_uop / OUT_mem_set_uop
// : registered at issue cycle, hold until next valid.
// OUT_STORE_uop : registered at issue; mem_dispatcher uses it to initiate
// result writeback after engine completion handshake.
// OUT_sram_rd_start : one-cycle pulse on GEMM/GEMV LOAD dispatch — starts
// preprocess_fmap broadcast from L1 cache.
// Reset state : all uops zeroed; OUT_sram_rd_start = 0.
// Errors : none surfaced (any MEMCPY from/to pair other than
// FROM_HOST -> TO_NPU silently defaults to from_L2_to_host).
// Counters : none.
// Assertions : (Stage C) exactly-zero-or-one IN_*_op_x64_valid per cycle;
// OUT_sram_rd_start is one-cycle pulse only.
// ===============================================================================
// Global_Scheduler
// ----------------
// Turns one decoded instruction (a valid pulse plus the raw instruction body)
// into registered micro-ops for the engines and the memory path.
//
// The raw body is reinterpreted, in parallel, as each of the five typed
// instruction layouts; the IN_*_op_x64_valid inputs select which of those
// views is actually used on a given cycle.
//
// Register update rules (all on posedge clk_core, synchronous active-low reset):
//   OUT_GEMM_uop / OUT_GEMV_uop / OUT_CVO_uop / OUT_mem_set_uop
//       each loads on its own valid and otherwise holds.
//   OUT_LOAD_uop
//       loads on GEMM, else GEMV, else MEMCPY, else CVO valid; otherwise holds.
//   OUT_STORE_uop
//       loads on GEMM, else GEMV, else CVO valid; otherwise holds. MEMCPY and
//       MEMSET leave the previous STORE uop in place.
//   OUT_sram_rd_start
//       1 for exactly the cycle after a GEMM or GEMV valid, 0 otherwise.
//
// The LOAD and STORE selections use different priority orders, so they are
// deliberately kept as two separate chains.
module Global_Scheduler #() (
    input  logic                clk_core,
    input  logic                rst_n_core,

    // Decoded instruction (from ctrl_npu_decoder)
    input  logic                IN_GEMV_op_x64_valid,
    input  logic                IN_GEMM_op_x64_valid,
    input  logic                IN_memcpy_op_x64_valid,
    input  logic                IN_memset_op_x64_valid,
    input  logic                IN_cvo_op_x64_valid,
    input  instruction_op_x64_t instruction,

    // Registered micro-ops
    output gemm_control_uop_t   OUT_GEMM_uop,
    output GEMV_control_uop_t   OUT_GEMV_uop,
    output memory_control_uop_t OUT_LOAD_uop,
    output memory_control_uop_t OUT_STORE_uop,
    output memory_set_uop_t     OUT_mem_set_uop,
    output cvo_control_uop_t    OUT_CVO_uop,

    // One-cycle pulse raised alongside a GEMM/GEMV LOAD uop
    output logic                OUT_sram_rd_start
);

  // ---------------------------------------------------------------------------
  // Typed views of the instruction body (pure reinterpretation, no state)
  // ---------------------------------------------------------------------------
  GEMV_op_x64_t   GEMV_op_x64;
  GEMM_op_x64_t   GEMM_op_x64;
  memcpy_op_x64_t memcpy_op_x64;
  memset_op_x64_t memset_op_x64;
  cvo_op_x64_t    cvo_op_x64;

  assign GEMV_op_x64   = GEMV_op_x64_t'(instruction.instruction);
  assign GEMM_op_x64   = GEMM_op_x64_t'(instruction.instruction);
  assign memcpy_op_x64 = memcpy_op_x64_t'(instruction.instruction);
  assign memset_op_x64 = memset_op_x64_t'(instruction.instruction);
  assign cvo_op_x64    = cvo_op_x64_t'(instruction.instruction);

  // ---------------------------------------------------------------------------
  // MEMCPY route select
  // Only the FROM_HOST -> TO_NPU pair selects from_host_to_L2; every other
  // from/to combination resolves to from_L2_to_host.
  // NOTE(review): field and enum widths are defined in isa_pkg - not visible here.
  // ---------------------------------------------------------------------------
  data_route_e memcpy_route;

  always_comb begin
    memcpy_route = from_L2_to_host;
    if (memcpy_op_x64.from_device == FROM_HOST && memcpy_op_x64.to_device == TO_NPU) begin
      memcpy_route = from_host_to_L2;
    end
  end

  // ---------------------------------------------------------------------------
  // Engine uop registers
  // Four independent registers; each one only reacts to its own valid input.
  // ---------------------------------------------------------------------------
  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      OUT_GEMM_uop    <= '0;
      OUT_GEMV_uop    <= '0;
      OUT_CVO_uop     <= '0;
      OUT_mem_set_uop <= '0;
    end else begin
      if (IN_GEMM_op_x64_valid) begin
        OUT_GEMM_uop <= '{
          flags         : GEMM_op_x64.flags,
          size_ptr_addr : GEMM_op_x64.size_ptr_addr,
          parallel_lane : GEMM_op_x64.parallel_lane
        };
      end

      if (IN_GEMV_op_x64_valid) begin
        OUT_GEMV_uop <= '{
          flags         : GEMV_op_x64.flags,
          size_ptr_addr : GEMV_op_x64.size_ptr_addr,
          parallel_lane : GEMV_op_x64.parallel_lane
        };
      end

      if (IN_cvo_op_x64_valid) begin
        OUT_CVO_uop <= '{
          cvo_func : cvo_func_e'(cvo_op_x64.cvo_func),
          src_addr : cvo_op_x64.src_addr,
          dst_addr : cvo_op_x64.dst_addr,
          length   : cvo_op_x64.length,
          flags    : cvo_flags_t'(cvo_op_x64.flags),
          async    : cvo_op_x64.async
        };
      end

      if (IN_memset_op_x64_valid) begin
        OUT_mem_set_uop <= '{
          dest_cache : dest_cache_e'(memset_op_x64.dest_cache),
          dest_addr  : memset_op_x64.dest_addr,
          a_value    : memset_op_x64.a_value,
          b_value    : memset_op_x64.b_value,
          c_value    : memset_op_x64.c_value
        };
      end
    end
  end

  // ---------------------------------------------------------------------------
  // LOAD uop + SRAM read-start pulse
  // Priority: GEMM, then GEMV, then MEMCPY, then CVO.
  // ---------------------------------------------------------------------------
  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      OUT_LOAD_uop      <= '0;
      OUT_sram_rd_start <= 1'b0;
    end else begin
      // Pulse is cleared every cycle unless a GEMM/GEMV branch re-raises it.
      OUT_sram_rd_start <= 1'b0;

      if (IN_GEMM_op_x64_valid) begin
        OUT_sram_rd_start <= 1'b1;
        OUT_LOAD_uop <= '{
          data_dest      : from_L2_to_L1_GEMM,
          dest_addr      : '0,
          src_addr       : GEMM_op_x64.src_addr,
          shape_ptr_addr : GEMM_op_x64.shape_ptr_addr,
          async          : SYNC_OP
        };
      end else if (IN_GEMV_op_x64_valid) begin
        OUT_sram_rd_start <= 1'b1;
        OUT_LOAD_uop <= '{
          data_dest      : from_L2_to_L1_GEMV,
          dest_addr      : '0,
          src_addr       : GEMV_op_x64.src_addr,
          shape_ptr_addr : GEMV_op_x64.shape_ptr_addr,
          async          : SYNC_OP
        };
      end else if (IN_memcpy_op_x64_valid) begin
        OUT_LOAD_uop <= '{
          data_dest      : memcpy_route,
          dest_addr      : memcpy_op_x64.dest_addr,
          src_addr       : memcpy_op_x64.src_addr,
          shape_ptr_addr : memcpy_op_x64.shape_ptr_addr,
          async          : memcpy_op_x64.async
        };
      end else if (IN_cvo_op_x64_valid) begin
        OUT_LOAD_uop <= '{
          data_dest      : from_L2_to_CVO,
          dest_addr      : '0,
          src_addr       : cvo_op_x64.src_addr,
          shape_ptr_addr : '0,
          async          : cvo_op_x64.async
        };
      end
    end
  end

  // ---------------------------------------------------------------------------
  // STORE uop
  // Priority: GEMM, then GEMV, then CVO. Captured at issue time and held until
  // the next GEMM/GEMV/CVO issue overwrites it; no completion handshake is
  // consumed inside this module.
  // ---------------------------------------------------------------------------
  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      OUT_STORE_uop <= '0;
    end else begin
      if (IN_GEMM_op_x64_valid) begin
        OUT_STORE_uop <= '{
          data_dest      : from_GEMM_res_to_L2,
          dest_addr      : GEMM_op_x64.dest_reg,
          src_addr       : '0,
          shape_ptr_addr : GEMM_op_x64.shape_ptr_addr,
          async          : SYNC_OP
        };
      end else if (IN_GEMV_op_x64_valid) begin
        OUT_STORE_uop <= '{
          data_dest      : from_GEMV_res_to_L2,
          dest_addr      : GEMV_op_x64.dest_reg,
          src_addr       : '0,
          shape_ptr_addr : GEMV_op_x64.shape_ptr_addr,
          async          : SYNC_OP
        };
      end else if (IN_cvo_op_x64_valid) begin
        OUT_STORE_uop <= '{
          data_dest      : from_CVO_res_to_L2,
          dest_addr      : cvo_op_x64.dst_addr,
          src_addr       : '0,
          shape_ptr_addr : '0,
          async          : cvo_op_x64.async
        };
      end
    end
  end

endmodule
Last verified against
The main branch of the public pccx-FPGA-NPU-LLM-kv260 repository, as
cloned by the documentation CI. The controller source listings and links
on this page should stay aligned with the files present in that RTL tree.
See also
Per-Instruction Dataflow — dependency and completion tracking.