NPU Controller Modules

RTL source on GitHub

SystemVerilog sources documented on this page:

1. Controller Top

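npu_controller_top.sv integrates the AXI-Lite frontend and the instruction decoder behind a single controller boundary. The Global Scheduler is not instantiated inside this wrapper: the decoded one-hot valid pulses and the 60-bit instruction body are exported as module outputs for it to consume.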

Listing 14 hw/rtl/NPU_Controller/npu_controller_top.sv
`timescale 1ns / 1ps

`include "GLOBAL_CONST.svh"

import isa_pkg::*;

// ===| Module: npu_controller_top — control-plane wrapper |=====================
// Purpose      : Structural shell that wires the AXI-Lite frontend
//                (ctrl_npu_frontend) to the opcode decoder (ctrl_npu_decoder)
//                so that NPU_top sees only decoded valids + instruction body.
// Spec ref     : pccx v002 §3 (ISA), §4 (control plane).
// Clock        : clk (= clk_core, 400 MHz).
// Reset        : rst_n active-low; i_clear is a synchronous soft-clear that is
//                forwarded to the frontend only (the decoder has no clear port).
// Latency      : Decode is 1-cycle registered after the frontend kick
//                (register stage lives in ctrl_npu_decoder).
// Throughput   : At most 1 decoded uop per clock (ISA serial issue).
// Handshake    : OUT_*_op_x64_valid are one-hot pulses — one or none asserted
//                per cycle. The 60-bit body is presented on OUT_op_x64.
// Backpressure : The decoder's OUT_fetch_PC_ready is fed back to the frontend's
//                IN_fetch_ready input.
// Reset state  : All OUT_*_op_x64_valid = 0; OUT_op_x64 = 0.
// Errors       : none surfaced (illegal opcodes silently dropped — TODO).
// Counters     : none.
// Assertions   : (Stage C) one-hot of decoded valids; instruction word stable
//                while the frontend kick is high and the decoder is not ready.
// ===============================================================================

module npu_controller_top #() (
    input logic clk,
    input logic rst_n,
    input logic i_clear,

    // ===| AXI4-Lite Slave : PS <-> NPU control plane |=========================
    axil_if.slave S_AXIL_CTRL,

    // ===| Decoded Instruction Valids |=========================================
    output logic OUT_GEMV_op_x64_valid,
    output logic OUT_GEMM_op_x64_valid,
    output logic OUT_memcpy_op_x64_valid,
    output logic OUT_memset_op_x64_valid,
    output logic OUT_cvo_op_x64_valid,

    // ===| Raw Instruction Body (60-bit, opcode stripped) |=====================
    output instruction_op_x64_t OUT_op_x64
);

  // ===| Frontend <-> Decoder link |=============================================
  // instr_word       : full-width instruction word produced by the frontend.
  // instr_word_valid : frontend kick qualifying instr_word.
  // decoder_ready    : flow-control return path from decoder to frontend.
  logic [`ISA_WIDTH-1:0] instr_word;
  logic                  instr_word_valid;
  logic                  decoder_ready;

  // ===| Decoder : Opcode -> per-engine valid pulses |===========================
  ctrl_npu_decoder u_decoder (
      // clock / reset
      .clk  (clk),
      .rst_n(rst_n),

      // instruction stream from the frontend
      .IN_raw_instruction       (instr_word),
      .raw_instruction_pop_valid(instr_word_valid),
      .OUT_fetch_PC_ready       (decoder_ready),

      // decoded body + one-hot valids, exported unchanged
      .OUT_op_x64             (OUT_op_x64),
      .OUT_GEMV_op_x64_valid  (OUT_GEMV_op_x64_valid),
      .OUT_GEMM_op_x64_valid  (OUT_GEMM_op_x64_valid),
      .OUT_memcpy_op_x64_valid(OUT_memcpy_op_x64_valid),
      .OUT_memset_op_x64_valid(OUT_memset_op_x64_valid),
      .OUT_cvo_op_x64_valid   (OUT_cvo_op_x64_valid)
  );

  // ===| Frontend : AXI-Lite CMD/STAT |==========================================
  ctrl_npu_frontend #() u_npu_frontend (
      // clock / reset / soft-clear
      .clk     (clk),
      .rst_n   (rst_n),
      .IN_clear(i_clear),

      // host-facing AXI4-Lite slave port
      .S_AXIL_CTRL(S_AXIL_CTRL),

      // instruction stream towards the decoder
      .OUT_RAW_instruction(instr_word),
      .OUT_kick           (instr_word_valid),
      .IN_fetch_ready     (decoder_ready),

      // status-return inputs are tied off at this level
      .IN_enc_stat ('0),
      .IN_enc_valid(1'b0)
  );

endmodule

2. Instruction Decoder

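ctrl_npu_decoder.sv parses the 64-bit VLIW instruction word: it strips the 4-bit opcode, raises a one-cycle, one-hot valid pulse for the matching engine, and forwards the 60-bit body as an untyped instruction_op_x64_t. The cast of that body into the appropriate typed struct (GEMV_op_x64_t, memcpy_op_x64_t, etc.) is performed downstream in Global_Scheduler.sv.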

Listing 15 hw/rtl/NPU_Controller/NPU_Control_Unit/ctrl_npu_decoder.sv
`timescale 1ns / 1ps
`include "GLOBAL_CONST.svh"

import isa_pkg::*;

// ===| Module: ctrl_npu_decoder — VLIW opcode → engine valid demux |============
// Purpose      : Take a raw VLIW instruction word from the frontend, split it
//                into opcode (top `ISA_OPCODE_WIDTH bits) and body (low
//                `ISA_BODY_WIDTH bits), raise the matching valid strobe for one
//                cycle, and register the body for the Global Scheduler.
// Spec ref     : pccx v002 §3 (ISA), §3.1 (opcode encoding).
// Clock        : clk @ 400 MHz.
// Reset        : rst_n active-low, sampled synchronously on posedge clk.
// Latency      : 1 cycle (raw_instruction_pop_valid → OUT_*_op_x64_valid).
// Throughput   : 1 instruction/cycle; decode is combinational and feeds a
//                single output register stage.
// Handshake    : OUT_fetch_PC_ready is tied high — the decoder never stalls.
// Reset state  : All OUT_*_op_x64_valid = 0; OUT_op_x64 = 0.
// Errors       : An opcode that matches no case item raises no strobe; the
//                body register is still updated for that word.
// Assertions   : (Stage C) one-hot of OUT_*_op_x64_valid; valid pulses
//                are exactly one cycle wide.
// Notes        : The five strobes are kept in one 5-bit register
//                (bit 0 GEMV, 1 GEMM, 2 MEMCPY, 3 MEMSET, 4 CVO).
// ===============================================================================

module ctrl_npu_decoder (
    input logic clk,
    input logic rst_n,

    // ===| From Frontend |=======================================================
    input logic [`ISA_WIDTH-1:0] IN_raw_instruction,
    input logic                  raw_instruction_pop_valid,

    // ===| Flow Control |========================================================
    output logic OUT_fetch_PC_ready,

    // ===| Decoded Valid Pulses (one-hot, one cycle) |===========================
    output logic OUT_GEMV_op_x64_valid,
    output logic OUT_GEMM_op_x64_valid,
    output logic OUT_memcpy_op_x64_valid,
    output logic OUT_memset_op_x64_valid,
    output logic OUT_cvo_op_x64_valid,

    // ===| Instruction Body (60-bit, opcode stripped) |=========================
    output instruction_op_x64_t OUT_op_x64
);

  // ===| Backpressure |==========================================================
  // Constant ready: every presented word is consumed in the cycle it arrives.
  assign OUT_fetch_PC_ready = 1'b1;

  // ===| Opcode field |==========================================================
  // Most-significant `ISA_OPCODE_WIDTH bits of the instruction word.
  logic [`ISA_OPCODE_WIDTH-1:0] opcode_field;
  assign opcode_field = IN_raw_instruction[`ISA_WIDTH-1 -: `ISA_OPCODE_WIDTH];

  // ===| Strobe next-state (combinational) |=====================================
  // strobe_d is all-zero unless a word is being popped AND its opcode matches
  // one of the five known engines; this is what makes each pulse one cycle wide.
  logic [4:0] strobe_d;
  logic [4:0] strobe_q;

  always_comb begin
    strobe_d = 5'b00000;

    if (raw_instruction_pop_valid) begin
      case (opcode_field)
        OP_GEMV:   strobe_d = 5'b00001;
        OP_GEMM:   strobe_d = 5'b00010;
        OP_MEMCPY: strobe_d = 5'b00100;
        OP_MEMSET: strobe_d = 5'b01000;
        OP_CVO:    strobe_d = 5'b10000;
        default:   strobe_d = 5'b00000;  // unknown opcode: no strobe
      endcase
    end
  end

  // ===| Output register stage |=================================================
  always_ff @(posedge clk) begin
    if (!rst_n) begin
      strobe_q   <= 5'b00000;
      OUT_op_x64 <= '0;
    end else begin
      strobe_q <= strobe_d;

      // The body register only changes when a word is popped; otherwise it
      // keeps the previous instruction body.
      if (raw_instruction_pop_valid) begin
        OUT_op_x64.instruction <= IN_raw_instruction[`ISA_BODY_WIDTH-1:0];
      end
    end
  end

  // ===| Strobe fan-out |========================================================
  assign OUT_GEMV_op_x64_valid   = strobe_q[0];
  assign OUT_GEMM_op_x64_valid   = strobe_q[1];
  assign OUT_memcpy_op_x64_valid = strobe_q[2];
  assign OUT_memset_op_x64_valid = strobe_q[3];
  assign OUT_cvo_op_x64_valid    = strobe_q[4];

endmodule

3. Global Scheduler

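Global_Scheduler.sv receives the decoded valid pulses and the 60-bit instruction body, casts the body to the per-opcode struct, and emits registered per-engine control μops (GEMM, GEMV, CVO, MEMSET, LOAD, STORE) plus the OUT_sram_rd_start pulse. Tracking of in-flight async instructions, the dependency scoreboard, and gating of new dispatches on a detected hazard do not appear in the listing below; see Per-Instruction Dataflow for dependency and completion tracking.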

Listing 16 hw/rtl/NPU_Controller/Global_Scheduler.sv
`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "GLOBAL_CONST.svh"

import isa_pkg::*;

// ===| Module: Global_Scheduler — VLIW → engine micro-op translator |============
// Purpose      : Reinterpret the 60-bit VLIW body as the per-opcode struct and
//                register the matching engine uops for
//                GEMM / GEMV / MEMCPY / MEMSET / CVO with fixed priority.
// Spec ref     : pccx v002 §3 (ISA decode), §4.2 (uop semantics).
// Clock        : clk_core @ 400 MHz.
// Reset        : rst_n_core active-low, sampled synchronously on posedge.
// Latency      : 1 cycle from IN_*_op_x64_valid to the registered uop.
// Throughput   : 1 uop/cycle per output channel; channels are mutually
//                exclusive in time per ISA serial-issue semantics.
// Priority     : OUT_LOAD_uop  : GEMM > GEMV > MEMCPY > CVO
//                OUT_STORE_uop : GEMM > GEMV > CVO (MEMCPY does not write it)
//                Each output has exactly one driving always_ff.
// Outputs      :
//   OUT_GEMM_uop / OUT_GEMV_uop / OUT_CVO_uop / OUT_mem_set_uop
//                  : written on the matching valid, otherwise hold.
//   OUT_STORE_uop  : written at issue; mem_dispatcher uses it to initiate
//                    result writeback after engine completion handshake.
//   OUT_sram_rd_start : one-cycle pulse on GEMM/GEMV LOAD dispatch — starts
//                       preprocess_fmap broadcast from L1 cache.
// Reset state  : all uops zeroed; OUT_sram_rd_start = 0.
// Errors       : none surfaced. Any MEMCPY from/to pair other than
//                FROM_HOST → TO_NPU is routed as from_L2_to_host.
// Counters     : none.
// Assertions   : (Stage C) exactly-zero-or-one IN_*_op_x64_valid per cycle;
//                OUT_sram_rd_start is one-cycle pulse only.
// ===============================================================================

module Global_Scheduler #() (
    input logic clk_core,
    input logic rst_n_core,

    // ===| From ctrl_npu_decoder |===============================================
    input logic IN_GEMV_op_x64_valid,
    input logic IN_GEMM_op_x64_valid,
    input logic IN_memcpy_op_x64_valid,
    input logic IN_memset_op_x64_valid,
    input logic IN_cvo_op_x64_valid,

    input instruction_op_x64_t instruction,

    // ===| Engine micro-ops |====================================================
    output gemm_control_uop_t   OUT_GEMM_uop,
    output GEMV_control_uop_t   OUT_GEMV_uop,
    output memory_control_uop_t OUT_LOAD_uop,
    output memory_control_uop_t OUT_STORE_uop,
    output memory_set_uop_t     OUT_mem_set_uop,
    output cvo_control_uop_t    OUT_CVO_uop,

    // ===| Datapath control |====================================================
    output logic OUT_sram_rd_start   // pulse: start fmap cache broadcast
);

  // ===| Typed views of the instruction body |===================================
  // The same body bits are cast to every per-opcode layout in parallel; the
  // IN_*_op_x64_valid strobes decide which view is actually consumed.
  GEMM_op_x64_t   GEMM_op_x64;
  GEMV_op_x64_t   GEMV_op_x64;
  memcpy_op_x64_t memcpy_op_x64;
  memset_op_x64_t memset_op_x64;
  cvo_op_x64_t    cvo_op_x64;

  assign GEMM_op_x64   = GEMM_op_x64_t'(instruction.instruction);
  assign GEMV_op_x64   = GEMV_op_x64_t'(instruction.instruction);
  assign memcpy_op_x64 = memcpy_op_x64_t'(instruction.instruction);
  assign memset_op_x64 = memset_op_x64_t'(instruction.instruction);
  assign cvo_op_x64    = cvo_op_x64_t'(instruction.instruction);

  // ===| MEMCPY route translation |==============================================
  // Collapses the from_device/to_device pair into a data_route_e value.
  // Default route is L2 → host; only the exact FROM_HOST/TO_NPU pair selects
  // host → L2.
  data_route_e memcpy_route;
  always_comb begin
    memcpy_route = from_L2_to_host;
    if (memcpy_op_x64.from_device == FROM_HOST && memcpy_op_x64.to_device == TO_NPU) begin
      memcpy_route = from_host_to_L2;
    end
  end

  // ===| GEMM uop |==============================================================
  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      OUT_GEMM_uop <= '0;
    end else begin
      if (IN_GEMM_op_x64_valid) begin
        OUT_GEMM_uop <= '{
            flags         : GEMM_op_x64.flags,
            size_ptr_addr : GEMM_op_x64.size_ptr_addr,
            parallel_lane : GEMM_op_x64.parallel_lane
        };
      end
    end
  end

  // ===| GEMV uop |==============================================================
  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      OUT_GEMV_uop <= '0;
    end else begin
      if (IN_GEMV_op_x64_valid) begin
        OUT_GEMV_uop <= '{
            flags         : GEMV_op_x64.flags,
            size_ptr_addr : GEMV_op_x64.size_ptr_addr,
            parallel_lane : GEMV_op_x64.parallel_lane
        };
      end
    end
  end

  // ===| CVO uop |===============================================================
  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      OUT_CVO_uop <= '0;
    end else begin
      if (IN_cvo_op_x64_valid) begin
        OUT_CVO_uop <= '{
            cvo_func : cvo_func_e'(cvo_op_x64.cvo_func),
            src_addr : cvo_op_x64.src_addr,
            dst_addr : cvo_op_x64.dst_addr,
            length   : cvo_op_x64.length,
            flags    : cvo_flags_t'(cvo_op_x64.flags),
            async    : cvo_op_x64.async
        };
      end
    end
  end

  // ===| MEMSET uop |============================================================
  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      OUT_mem_set_uop <= '0;
    end else begin
      if (IN_memset_op_x64_valid) begin
        OUT_mem_set_uop <= '{
            dest_cache : dest_cache_e'(memset_op_x64.dest_cache),
            dest_addr  : memset_op_x64.dest_addr,
            a_value    : memset_op_x64.a_value,
            b_value    : memset_op_x64.b_value,
            c_value    : memset_op_x64.c_value
        };
      end
    end
  end

  // ===| LOAD uop — next-state logic (priority: GEMM > GEMV > MEMCPY > CVO) |====
  // load_uop_next : value to be registered into OUT_LOAD_uop.
  // load_uop_we   : low only when none of the four LOAD sources is valid, so
  //                 the register holds its previous contents.
  // fmap_rd_kick  : high for GEMM/GEMV dispatch only; becomes OUT_sram_rd_start.
  memory_control_uop_t load_uop_next;
  logic                load_uop_we;
  logic                fmap_rd_kick;

  always_comb begin
    load_uop_next = '0;
    load_uop_we   = 1'b1;
    fmap_rd_kick  = 1'b0;

    if (IN_GEMM_op_x64_valid) begin
      load_uop_next = '{
          data_dest      : from_L2_to_L1_GEMM,
          dest_addr      : '0,
          src_addr       : GEMM_op_x64.src_addr,
          shape_ptr_addr : GEMM_op_x64.shape_ptr_addr,
          async          : SYNC_OP
      };
      fmap_rd_kick = 1'b1;

    end else if (IN_GEMV_op_x64_valid) begin
      load_uop_next = '{
          data_dest      : from_L2_to_L1_GEMV,
          dest_addr      : '0,
          src_addr       : GEMV_op_x64.src_addr,
          shape_ptr_addr : GEMV_op_x64.shape_ptr_addr,
          async          : SYNC_OP
      };
      fmap_rd_kick = 1'b1;

    end else if (IN_memcpy_op_x64_valid) begin
      load_uop_next = '{
          data_dest      : memcpy_route,
          dest_addr      : memcpy_op_x64.dest_addr,
          src_addr       : memcpy_op_x64.src_addr,
          shape_ptr_addr : memcpy_op_x64.shape_ptr_addr,
          async          : memcpy_op_x64.async
      };

    end else if (IN_cvo_op_x64_valid) begin
      load_uop_next = '{
          data_dest      : from_L2_to_CVO,
          dest_addr      : '0,
          src_addr       : cvo_op_x64.src_addr,
          shape_ptr_addr : '0,
          async          : cvo_op_x64.async
      };

    end else begin
      load_uop_we = 1'b0;  // nothing issued this cycle: keep OUT_LOAD_uop
    end
  end

  // ===| LOAD uop — register stage (sole driver of OUT_LOAD_uop) |===============
  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      OUT_LOAD_uop      <= '0;
      OUT_sram_rd_start <= 1'b0;
    end else begin
      // Re-evaluated every cycle, so the start strobe drops back to 0 on the
      // cycle after a GEMM/GEMV dispatch.
      OUT_sram_rd_start <= fmap_rd_kick;

      if (load_uop_we) begin
        OUT_LOAD_uop <= load_uop_next;
      end
    end
  end

  // ===| STORE uop — latched at issue time (priority: GEMM > GEMV > CVO) |=======
  // Keeps its value until the next GEMM/GEMV/CVO issue. The engine completion
  // handshake that consumes it is external to this module (not shown here).
  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      OUT_STORE_uop <= '0;
    end else begin
      if (IN_GEMM_op_x64_valid) begin
        OUT_STORE_uop <= '{
            data_dest      : from_GEMM_res_to_L2,
            dest_addr      : GEMM_op_x64.dest_reg,
            src_addr       : '0,
            shape_ptr_addr : GEMM_op_x64.shape_ptr_addr,
            async          : SYNC_OP
        };
      end else if (IN_GEMV_op_x64_valid) begin
        OUT_STORE_uop <= '{
            data_dest      : from_GEMV_res_to_L2,
            dest_addr      : GEMV_op_x64.dest_reg,
            src_addr       : '0,
            shape_ptr_addr : GEMV_op_x64.shape_ptr_addr,
            async          : SYNC_OP
        };
      end else if (IN_cvo_op_x64_valid) begin
        OUT_STORE_uop <= '{
            data_dest      : from_CVO_res_to_L2,
            dest_addr      : cvo_op_x64.dst_addr,
            src_addr       : '0,
            shape_ptr_addr : '0,
            async          : cvo_op_x64.async
        };
      end
    end
  end

endmodule

Last verified against

This page was last verified against the current public pccx-FPGA-NPU-LLM-kv260 main clone used by the documentation CI. Controller source references should stay aligned with the files present in that public RTL tree.

See also

Per-Instruction Dataflow — dependency and completion tracking.