Abstractions / HDL

The NaxRiscv implementation takes advantage of quite a few paradigms available when using SpinalHDL (a Scala hardware description library).

Framework

The toplevel of NaxRiscv is mostly an empty Component with a framework which can schedule a list of plugins. The framework itself does not create any hardware.

Here is the NaxRiscv toplevel

//Toplevel: mostly an empty Component, all hardware being created by the scheduled plugins
class NaxRiscv(xlen : Int,
                           plugins : Seq[Plugin]) extends Component{
  NaxScope.create(xlen = xlen) //Will come back on that line later
  val framework = new Framework(plugins) //Schedules the plugins' early/late tasks
}

To give an overview of how much the design is split between plugins, here is the list of them for one functional CPU :

//One functional CPU configuration, assembled as a list of plugins
val plugins = ArrayBuffer[Plugin]()
plugins += new DocPlugin()
plugins += new MmuPlugin(
  spec    = MmuSpec.sv32, //RISC-V Sv32 virtual memory specification
  ioRange = ioRange,
  fetchRange = fetchRange
)

//FETCH
plugins += new FetchPlugin() //Provides the fetch pipeline skeleton
plugins += new PcPlugin(resetVector)
plugins += new FetchCachePlugin(
  cacheSize = 4096*4, //16 KiB
  wayCount = 4,
  injectionAt = 2,
  fetchDataWidth = 64,
  memDataWidth = 64,
  reducedBankWidth = false,
  hitsWithTranslationWays = true,
  translationStorageParameter = MmuStorageParameter(
    levels   = List(
      MmuStorageLevel(
        id    = 0,
        ways  = 4,
        depth = 32
      ),
      MmuStorageLevel(
        id    = 1,
        ways  = 2,
        depth = 32
      )
    ),
    priority = 0
  ),
  translationPortParameter  = MmuPortParameter(
    readAt = 1,
    hitsAt = 1,
    ctrlAt = 1,
    rspAt  = 1
  )
)
plugins += new AlignerPlugin(
  decodeCount = 2, //Up to two instructions fed to decode per cycle
  inputAt = 2
)

//FRONTEND
plugins += new FrontendPlugin() //Provides the frontend pipeline skeleton
plugins += new DecompressorPlugin()
plugins += new DecoderPlugin()
plugins += new RfTranslationPlugin()
plugins += new RfDependencyPlugin()
plugins += new RfAllocationPlugin(riscv.IntRegFile)
plugins += new DispatchPlugin(
  slotCount = 32
)

//BRANCH PREDICTION
plugins += new BranchContextPlugin(
  branchCount = 16
)
plugins += new HistoryPlugin(
  historyFetchBypass = true
)
plugins += new DecoderPredictionPlugin(
  flushOnBranch = false //TODO remove me (DEBUG)
)
plugins += new BtbPlugin( //Branch target buffer
  entries = 512,
  readAt = 0,
  hitAt = 1,
  jumpAt = 1
)
plugins += new GSharePlugin( //GShare-style direction predictor
  memBytes = 4 KiB,
  historyWidth = 24,
  readAt = 0
)

//LOAD / STORE
plugins += new LsuPlugin(
  lqSize = 16, //Load queue entries
  sqSize = 16, //Store queue entries
  loadToCacheBypass = true,
  lqToCachePipelined = true,
  hitPedictionEntries = 1024,
  translationStorageParameter = MmuStorageParameter(
    levels   = List(
      MmuStorageLevel(
        id    = 0,
        ways  = 4,
        depth = 32
      ),
      MmuStorageLevel(
        id    = 1,
        ways  = 2,
        depth = 32
      )
    ),
    priority = 1
  ),
  loadTranslationParameter  = MmuPortParameter(
    readAt = 0,
    hitsAt = 1,
    ctrlAt = 1,
    rspAt  = 1
  ),
  storeTranslationParameter = MmuPortParameter(
    readAt = 1,
    hitsAt = 1,
    ctrlAt = 1,
    rspAt  = 1
  )
)
plugins += new DataCachePlugin(
  memDataWidth = 64,
  cacheSize    = 4096*4, //16 KiB
  wayCount     = 4,
  refillCount = 2,
  writebackCount = 2,
  tagsReadAsync = true,
  reducedBankWidth = false,
  loadRefillCheckEarly = false
)

//MISC
plugins += new RobPlugin(
  robSize = 64, //Reorder buffer entries
  completionWithReg = false
)
plugins += new CommitPlugin(
  commitCount = 2, //Up to two instructions committed per cycle
  ptrCommitRetimed = true
)
plugins += new RegFilePlugin(
  spec = riscv.IntRegFile,
  physicalDepth = 64, //Number of physical registers
  bankCount = 1
)
plugins += new CommitDebugFilterPlugin(List(4, 8, 12))
plugins += new CsrRamPlugin()
plugins += new PrivilegedPlugin(PrivilegedConfig.full.copy(withRdTime = withRdTime))
plugins += new PerformanceCounterPlugin(
  additionalCounterCount = 4,
  bufferWidth            = 6
)

//EXECUTION UNITS
plugins += new ExecutionUnitBase("EU0")
plugins += new SrcPlugin("EU0", earlySrc = true)
plugins += new IntAluPlugin("EU0", aluStage = 0)
plugins += new ShiftPlugin("EU0" , aluStage = 0)
plugins += new BranchPlugin("EU0")

plugins += new ExecutionUnitBase("EU1")
plugins += new SrcPlugin("EU1")
plugins += new IntAluPlugin("EU1")
plugins += new ShiftPlugin("EU1")
plugins += new BranchPlugin("EU1")

//EU2 additionally handles mul/div, load/store, CSR accesses and environment calls
plugins += new ExecutionUnitBase("EU2", writebackCountMax = 1)
plugins += new SrcPlugin("EU2", earlySrc = true)
plugins += new MulPlugin("EU2", writebackAt = 2, staticLatency = false)
plugins += new DivPlugin("EU2", writebackAt = 2)
plugins += new LoadPlugin("EU2")
plugins += new StorePlugin("EU2")
plugins += new EnvCallPlugin("EU2")(rescheduleAt = 2)
plugins += new CsrAccessPlugin("EU2")(
  decodeAt = 0,
  readAt = 1,
  writeAt = 2,
  writebackAt = 2,
  staticLatency = false
)

Each of those plugins may :

  • Implement services used by other plugins (ex : provide jump interfaces, provide rescheduling interface, provide a pipeline skeleton)

  • Use other plugins functionalities

  • Create hardware

  • Create early tasks (used to setup things between plugins)

  • Create late tasks (used in general to create the required hardware)

Plugin tasks

Here is an instance of a dummy plugin creating two tasks (setup / logic):

//Minimal plugin showing the two kinds of tasks a plugin can create
class DummyPlugin extends Plugin {
  val setup = create early new Area {
    //Here you can setup things with other plugins
    //This code will always run before any late tasks
  }

  val logic = create late new Area {
    //Here you can (for instance) generate hardware
    //This code will always start after any early task
  }
}

Note that create early and create late will execute their code in new threads, which are scheduled by the Framework class.

Service definition

For instance, the JumpService, providing a hardware jump interface to other plugins. Such a service can be defined as :

//Software interface (elaboration time)
trait JumpService extends Service{
  def createJumpInterface(priority : Int) : Flow[JumpCmd]
}

//Hardware payload of the interface
case class JumpCmd(pcWidth : Int) extends Bundle{
  val pc = UInt(pcWidth bits)
}

Service implementation

Taking the previously shown JumpService, the PcPlugin could implement it the following way :

case class JumpSpec(interface :  Flow[JumpCmd], priority : Int)
class PcPlugin() extends Plugin with JumpService{
  val jumpsSpec = ArrayBuffer[JumpSpec]()

  //Called by other plugins (usually in their early tasks) to register a jump request interface
  override def createJumpInterface(priority : Int): Flow[JumpCmd] = {
    val spec = JumpSpec(Flow(JumpCmd(32)), priority)
    jumpsSpec += spec
    return spec.interface
  }

  val logic = create late new Area{
    //Here, implement the PC logic and manage the jumpsSpec interfaces
        val pc = Reg(UInt(32 bits))
        val sortedJumps = jumpsSpec.sortBy(_.priority) //Lower priority first
        for(jump <- sortedJumps){
              when(jump.interface.valid){
                pc := jump.interface.pc //Last assignment wins, so higher priorities override lower ones
              }
        }
        ...
  }
}

Service usage

Another plugin could then retrieve and use this service by :

//Retrieves the JumpService during its early task, then drives the interface in its late task
class AnotherPlugin() extends Plugin {
  val setup = create early new Area {
    val jump = getService[JumpService].createJumpInterface(42)
  }

  val logic = create late new Area {
    setup.jump.valid := ???
    setup.jump.pc := ???
  }
}

Service Pipeline definition

Some plugins may even create a pipeline skeleton which can then be populated by other plugins. For instance :

//Provides a two stages pipeline skeleton that other plugins can populate
class FetchPlugin() extends Plugin with LockedImpl {
  val pipeline = create early new Pipeline{
    val stagesCount = 2
    val stages = Array.fill(stagesCount)(newStage())

    import spinal.lib.pipeline.Connection._
    //Connect every stage together
    for((m, s) <- (stages.dropRight(1), stages.tail).zipped){
      connect(m, s)(M2S())
    }
  }

  val logic = create late new Area{
    lock.await() //Allow other plugins to make this blocking until they specified everything they wanted in the pipeline stages.
    pipeline.build()
  }
}

Service Pipeline usage

For instance, the PcPlugin will want to introduce the PC value into the fetch pipeline :

object PcPlugin extends AreaObject{
  val FETCH_PC = Stageable(UInt(32 bits))  //Define the concept of a FETCH_PC signal being usable through a pipeline
}

class PcPlugin() extends Plugin with ...{

  val setup = create early new Area{
    getService[FetchPlugin].retain() //We need to hold the FetchPlugin logic task until we create all the associated accesses
  }

  val logic = create late new Area{
    val fetch = getService[FetchPlugin]
    val firstStage = fetch.pipeline.stages(0)

    firstStage(PcPlugin.FETCH_PC) := ???   //Assign the FETCH_PC value in firstStage of the pipeline. Other plugins may access it down stream.
    fetch.release() //Unblock the FetchPlugin logic task held by the retain() above
  }
}

Execution units

Implementation of the execution units is another practical use of this concept. You can spawn an execution unit by creating a new ExecutionUnitBase with a unique execution unit identifier :

plugins += new ExecutionUnitBase("EU0") //"EU0" is the unique identifier of this execution unit

Then you can populate that execution unit by adding new ExecutionUnitElementSimple with the same identifier :

//Each element targets the execution unit whose identifier matches
plugins += new SrcPlugin("EU0")
plugins += new IntAluPlugin("EU0")
plugins += new ShiftPlugin("EU0")

Here is the example of an execution unit handling :

  • mul/div

  • jump/branches

  • load/store

  • CSR accesses

  • ebreak/ecall/mret/wfi

//Execution unit handling mul/div, branches, load/store, CSR accesses and environment calls
plugins += new ExecutionUnitBase("EU1", writebackCountMax = 1)
plugins += new SrcPlugin("EU1")
plugins += new MulPlugin("EU1", writebackAt = 2, staticLatency = false)
plugins += new DivPlugin("EU1", writebackAt = 2)
plugins += new BranchPlugin("EU1", writebackAt = 2, staticLatency = false)
plugins += new LoadPlugin("EU1")
plugins += new StorePlugin("EU1")
plugins += new CsrAccessPlugin("EU1")(
  decodeAt = 0,
  readAt = 1,
  writeAt = 2,
  writebackAt = 2,
  staticLatency = false
)
plugins += new EnvCallPlugin("EU1")(rescheduleAt = 2)

ShiftPlugin

Here is the ShiftPlugin as an example of ExecutionUnitElementSimple plugin:

object ShiftPlugin extends AreaObject {
  val SIGNED = Stageable(Bool()) //Arithmetic (sign propagating) shift when set
  val LEFT = Stageable(Bool()) //Left shift when set, right shift otherwise
}

class ShiftPlugin(euId : String, staticLatency : Boolean = true, aluStage : Int = 0) extends ExecutionUnitElementSimple(euId, staticLatency) {
  import ShiftPlugin._

  override def euWritebackAt = aluStage

  override val setup = create early new Setup{
    import SrcKeys._

    //Register the shift instructions and how to decode their LEFT/SIGNED flags
    add(Rvi.SLL , List(SRC1.RF, SRC2.RF), DecodeList(LEFT -> True,  SIGNED -> False))
    add(Rvi.SRL , List(SRC1.RF, SRC2.RF), DecodeList(LEFT -> False, SIGNED -> False))
    add(Rvi.SRA , List(SRC1.RF, SRC2.RF), DecodeList(LEFT -> False, SIGNED -> True))
    add(Rvi.SLLI, List(SRC1.RF, SRC2.I ), DecodeList(LEFT -> True , SIGNED -> False))
    add(Rvi.SRLI, List(SRC1.RF, SRC2.I ), DecodeList(LEFT -> False, SIGNED -> False))
    add(Rvi.SRAI, List(SRC1.RF, SRC2.I ), DecodeList(LEFT -> False, SIGNED -> True))
  }

  override val logic = create late new Logic{
    val process = new ExecuteArea(aluStage) {
      import stage._
      val ss = SrcStageables

      assert(Global.XLEN.get == 32) //This implementation only covers 32 bits shifts
      val amplitude  = ss.SRC2(4 downto 0).asUInt //Shift amount (5 bits for XLEN = 32)
      //Left shifts are implemented by bit-reversing the operand, shifting right, then reversing back
      val reversed   = Mux[SInt](LEFT, ss.SRC1.reversed, ss.SRC1)
      val shifted = (S((SIGNED & ss.SRC1.msb) ## reversed) >> amplitude).resize(Global.XLEN bits)
      val patched = LEFT ? shifted.reversed | shifted

      wb.payload := B(patched)
    }
  }
}

Pipeline

To allow the definition of extendable/flexible pipelines, the Pipeline abstraction was put into place. This abstraction allows to define stages, arbitrations, connections, and values connected through them.

Here is a simple example :

new Pipeline{
       val fetch = newStage()
       val decoded = newStage()
       val execute = newStage()
       val memory = newStage()
       val writeback = newStage()

       connect(fetch, decoded)(M2S())
       connect(decoded, execute)(M2S())
       connect(execute, memory)(M2S())
       connect(memory, writeback)(M2S())

       val PC = Stageable(UInt(32 bits)) //This isn't a hardware signal, but it is a "key" used to identify the concept of PC (program counter) in the whole pipeline.
       fetch(PC) := xxx      //Assign xxx to the fetch(PC) value
       yyy := writeback(PC)  //Assign the writeback(PC) value to yyy

       execute.haltWhen(execute(PC) === zzz) //Halt the pipeline at the execute stage when the execute(PC) matches zzz

       build()  //Generate all the required hardware. This will for instance pipeline the PC from the fetch stage to where it is needed (execute/writeback stage)
 }

Based on that API, multiple plugins in the NaxRiscv CPU can compose / extend existing pipelines. Also note that some plugins may be put into place as a skeleton pipeline for other plugins to work with. This was done for the FetchPlugin, FrontendPlugin and the ExecutionUnitBase.

This API, combined with the concurrent hardware elaboration of the plugins, also allows to adjust the number of stages in the pipeline depending on the plugins’ needs, as it is done for the ExecutionUnitBase plugin.

To be more concrete, execution unit plugins using an ExecutionUnitBase as a pipeline skeleton only have to refer to a given stage number for it to be dynamically created (if it wasn’t already created before).

Another function of the pipeline API is that it allows identifying pipeline elements with a secondary key. For instance, in the following example some logic which could be used to calculate the next PC of a branch in a pipelined way.

val PC = Stageable(UInt(32 bits))
val COND = Stageable(Bool()) //Branch taken condition; Bool, as it drives the when() below
stageA(COND) := xxx
stageA(PC, "WITHOUT_BRANCH") := stageA(PC) + 4 //Using the string "WITHOUT_BRANCH" as a secondary key
stageA(PC, "WITH_BRANCH") := stageA(PC) + offset
//Select the next PC in a later stage, depending on the branch condition
when(stageB(COND)){
  stageB(PC, "NEXT") := stageB(PC, "WITH_BRANCH")
} otherwise {
  stageB(PC, "NEXT") := stageB(PC, "WITHOUT_BRANCH")
}

But more generally, this secondary key is used to access a “dimension”. For instance, the pipeline being used to decode two instructions at a time :

//One OPCODE per decoding unit, distinguished through the secondary key
val OPCODE = Stageable(Bits(32 bits))
for(decodeIndex <- 0 until decodeCount){
   decodeStage(OPCODE, decodeIndex) := xxx   //Using the index of the decoding unit we are working on as a secondary key
}

State machine API

Not something ground breaking, but in a few places, the SpinalHDL state machine API is used. Here is a short example of the API :

//Three states machine : stateA -> stateB (counts up to 4) -> stateC -> stateA
val fsm = new StateMachine{
   val stateA = new State with EntryPoint //Initial state after reset
   val stateB = new State
   val stateC = new State
   val counter = Reg(UInt(8 bits)) init (0)

   stateA.whenIsActive (goto(stateB))

   stateB.onEntry(counter := 0) //Restart the counter each time stateB is entered
   stateB.whenIsActive {
       counter := counter + 1
         when(counter === 4){
           goto(stateC)
         }
   }
   stateB.onExit(io.result := True) //Asserted when leaving stateB

   stateC.whenIsActive (goto(stateA))
 }

Automated multiport memory transformation

In quite a few places in the design, there are memories which need multiple write ports. Such memories are not directly inferable by the FPGA synthesis tools most of the time.

Still the NaxRiscv Scala code defines them all over the place, and to fill this gap, a custom SpinalHDL transformation phase was added to simplify them into groups of simple dual port ram with xor based glue.

Here is how this transformation phase is added into the flow :

val spinalConfig = SpinalConfig()
spinalConfig.addTransformationPhase(new MultiPortWritesSymplifier) //Rewrites multi-write memories before netlist generation
spinalConfig.generateVerilog(new NaxRiscv(xlen = 32, plugins))

And here is how such a transformation phase is defined :

//Transformation phase replacing multi-write memories with RamAsyncMwXor instances
class MultiPortWritesSymplifier extends PhaseMemBlackboxing{
  override def doBlackboxing(pc: PhaseContext, typo: MemTopology) = {
        //Only transform memories having multiple write ports and no synchronous read port
        if(typo.writes.size > 1 && typo.readsSync.size == 0){
          typo.writes.foreach(w => assert(w.mask == null)) //Masked writes are not supported
          typo.writes.foreach(w => assert(w.clockDomain == typo.writes.head.clockDomain)) //All writes must share one clock domain
          val cd = typo.writes.head.clockDomain

          import typo._

          val ctx = List(mem.parentScope.push(), cd.push()) //Elaborate the replacement in the memory's scope and clock domain

          // RamAsyncMwXor implement multiple write ports using simple dual port rams and xors
          val c = RamAsyncMwXor(
                payloadType = Bits(mem.width bits),
                depth       = mem.wordCount,
                writePorts  = writes.size,
                readPorts   = readsAsync.size
          ).setCompositeName(mem)

          //Connect the write ports to the RamAsyncMwXor
          for((dst, src) <- (c.io.writes, writes).zipped){
                dst.valid.assignFrom(src.writeEnable)
                dst.address.assignFrom(src.address)
                dst.data.assignFrom(src.data)
          }

          //Connect the read ports to the RamAsyncMwXor
          for((reworked, old) <- (c.io.read, readsAsync).zipped){
                reworked.cmd.payload.assignFrom(old.address)
                wrapConsumers(typo, old, reworked.rsp) //Redirect users of the old read port to the new one
          }

          //Cleanup the old memory from the netlist
          mem.removeStatement()
          mem.foreachStatements(s => s.removeStatement())

          ctx.foreach(_.restore()) //Pop the scope/clock domain contexts pushed above
        }
  }
}