# Source code for hwtLib.amba.axi_comp.cache.cacheWriteAllocWawOnlyWritePropagating

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from typing import List

from hwt.code import Or, SwitchLogic, If
from hwt.code_utils import rename_signal
from hwt.hdl.types.bits import Bits
from hwt.hdl.types.defs import BIT
from hwt.hdl.types.struct import HStruct
from hwt.interfaces.hsStructIntf import HsStructIntf
from hwt.interfaces.std import Handshaked
from hwt.interfaces.structIntf import StructIntf
from hwt.interfaces.utils import addClkRstn, propagateClkRstn
from hwt.math import log2ceil, isPow2
from hwt.synthesizer.param import Param
from hwt.synthesizer.rtlLevel.rtlSignal import RtlSignal
from hwtLib.amba.axi4 import Axi4, Axi4_r, Axi4_addr, Axi4_w, Axi4_b
from hwtLib.amba.axi_comp.cache.addrTypeConfig import CacheAddrTypeConfig
from hwtLib.amba.axi_comp.cache.lru_array import AxiCacheLruArray, IndexWayHs
from hwtLib.amba.axi_comp.cache.tag_array import AxiCacheTagArray, \
    AxiCacheTagArrayLookupResIntf, AxiCacheTagArrayUpdateIntf
from hwtLib.amba.axis_comp.builder import AxiSBuilder
from hwtLib.amba.constants import RESP_OKAY, BURST_INCR, CACHE_DEFAULT, \
    LOCK_DEFAULT, BYTES_IN_TRANS, PROT_DEFAULT, QOS_DEFAULT
from hwtLib.common_nonstd_interfaces.addr_hs import AddrHs
from hwtLib.handshaked.builder import HsBuilder
from hwtLib.handshaked.reg import HandshakedReg
from hwtLib.handshaked.streamNode import StreamNode
from hwtLib.logic.binToOneHot import binToOneHot
from hwtLib.mem.ramTransactional import RamTransactional
from hwtLib.mem.ramTransactional_io import TransRamHsR, TransRamHsW
from pyMathBitPrecise.bit_utils import mask


# https://chipress.co/category/job-roles-titles/page/16/
# https://chipress.co/2019/04/13/can-you-show-the-state-transition-for-snoop-based-scheme-using-msi-protocol/
# https://github.com/airin711/Verilog-caches
# https://github.com/rajshadow/4-way-set-associative-cache-verilog
# https://github.com/xdesigns/4way-cache
# https://github.com/prasadp4009/2-way-Set-Associative-Cache-Controller
class AxiCacheWriteAllocWawOnlyWritePropagating(CacheAddrTypeConfig):
    """
    Non-blocking pipelined Set Associative cache for AXI interfaces which is designed
    to work with an LSU which solves only WAW (write-after-write) conflicts.

    :note: Write propagation in this context means that any read received will contain
        the data written a few clock cycles before the actual request (the number of cycles
        is derived from the read latency of the LSU and the latency of the read resolution).
        This means that if the master checks the last N transactions for a collision, the data
        is asserted to be in the last version or to be marked with an invalidation flag.
        The N is usually 3 and is derived from the latency of the LSU which should be connected
        behind this cache.

    :attention: This cache solves only WAW conflicts, this means that WAR and RAW conflicts
        are left unsolved and must be handled on master side. This is suitable
        for cumulative operations in general as together with write propagating
        it allows the master component to significantly reduce buffers and collision
        detection logic.

    .. figure:: ./_static/AxiCacheWriteAllocWawOnlyWritePropagating.png

    :see: :class:`hwtLib.amba.axi_comp.cache.CacheAddrTypeConfig`
    :ivar DATA_WIDTH: data width of interfaces
    :ivar WAY_CNT: number of places where one cache line can be stored

    :note: 1-way associative = directly mapped
    :note: This cache does not check access collision with requests to main (slave) memory.
        It only provides an information for LSU to do so. The LSU is supposed to be connected
        between main memory and this cache (= on master port where slave should be connected).

    * The tag_array contains tags and cache line status flags for cache lines.
    * The lru_array contains the data for the pseudo LRU (Least Recently Used) cache
      replacement policy. It is stored in a separate array due to high requirements
      for concurrent access which results in increased memory consumption.
    * The data_array is a RAM where data for cache lines is stored.

    The memories are separated because they have different memory port requirements
    and we want to keep the number of memory ports and the size of the memory minimal
    as resource requirements grow exponentially with increasing number of memory ports.

    .. hwt-autodoc:: _example_AxiCacheWriteAllocWawOnlyWritePropagating
    """

    def _config(self):
        """
        Declare the parameters: everything from Axi4, the cache specific
        WAY_CNT, MAX_BLOCK_DATA_WIDTH and IS_PREINITIALIZED and then
        the parameters of CacheAddrTypeConfig.
        """
        Axi4._config(self)
        self.WAY_CNT = Param(4)
        self.MAX_BLOCK_DATA_WIDTH = Param(None)
        self.IS_PREINITIALIZED = Param(False)
        CacheAddrTypeConfig._config(self)

    def _declr(self):
        """
        Check the configuration and declare the interfaces (s, m, read_cancel)
        and the sub-components (tag_array, lru_array, data_array).
        """
        assert self.CACHE_LINE_CNT > 0, self.CACHE_LINE_CNT
        assert self.WAY_CNT > 0 and isPow2(self.WAY_CNT), self.WAY_CNT
        assert self.CACHE_LINE_CNT % self.WAY_CNT == 0, (self.CACHE_LINE_CNT, self.WAY_CNT)
        # the byte width has to be validated first, because it is used as a divisor
        # in the next check (DATA_WIDTH < 8 would end with ZeroDivisionError otherwise)
        assert self.DATA_WIDTH > 0 and self.DATA_WIDTH % 8 == 0, self.DATA_WIDTH
        assert isPow2(self.CACHE_LINE_SIZE // (self.DATA_WIDTH // 8)), (
            self.CACHE_LINE_SIZE, self.DATA_WIDTH)
        self._compupte_tag_index_offset_widths()

        addClkRstn(self)
        with self._paramsShared():
            self.s = Axi4()
            self.m = Axi4()._m()

            rc = self.read_cancel = AddrHs()._m()
            rc.ID_WIDTH = 0

            self.tag_array = AxiCacheTagArray()
            self.lru_array = AxiCacheLruArray()
            for a in [self.tag_array, self.lru_array]:
                a.PORT_CNT = 2  # r+w

        # NOTE(review): the nesting was reconstructed from a flattened source,
        # the data_array is assumed to be declared outside of the _paramsShared block
        # as all its parameters are set explicitly - confirm
        da = RamTransactional()
        da.MAX_BLOCK_DATA_WIDTH = self.MAX_BLOCK_DATA_WIDTH
        da.WORD_WIDTH = self.CACHE_LINE_SIZE * 8
        da.DATA_WIDTH = self.DATA_WIDTH
        da.ADDR_WIDTH = log2ceil(self.CACHE_LINE_CNT)
        da.R_ID_WIDTH = self.ID_WIDTH
        da.W_PRIV_T = HStruct(
            # used to construct an address for flush of original item in cache
            # which is being replaced
            # (the index part of the address is the address on flush_data.addr channel)
            (Bits(self.TAG_W), "victim_tag"),
            (Bits(self.ID_WIDTH), "id"),
        )
        self.data_array = da

        # self.flush = HandshakeSync()
        # self.init = HandshakeSync()

    def axiAddrDefaults(self, a: Axi4_addr):
        """
        Drive the constant fields of an AXI address channel so that
        the transaction is an INCR burst of CACHE_LINE_SIZE bytes
        composed of DATA_WIDTH wide words.

        :param a: AXI address channel (ar or aw) whose burst, len, cache, lock,
            size, prot and qos signals are driven
        """
        a.burst(BURST_INCR)
        # number of data words in a cache line - 1
        a.len(self.CACHE_LINE_SIZE // (self.DATA_WIDTH // 8) - 1)
        a.cache(CACHE_DEFAULT)
        a.lock(LOCK_DEFAULT)
        a.size(BYTES_IN_TRANS(self.DATA_WIDTH // 8))
        a.prot(PROT_DEFAULT)
        a.qos(QOS_DEFAULT)

    def connect_tag_lookup(self, init_in_progress: RtlSignal):
        """
        Connect s.ar and s.aw to the lookup ports of the tag_array.
        The address of each write request is additionally sent to read_cancel.

        :param init_in_progress: flag which blocks the lookups while it is set
        """
        in_ar, in_aw = self.s.ar, self.s.aw
        # connect address lookups to a tag array
        tags = self.tag_array
        for a, tag_lookup in zip((in_ar, in_aw), tags.lookup):
            tag_lookup.addr(a.addr)
            tag_lookup.id(a.id)
            if a is in_aw:
                rc = self.read_cancel
                rc.addr(a.addr)
                StreamNode([a], [tag_lookup, rc]).sync(~init_in_progress)
            else:
                StreamNode([a], [tag_lookup]).sync(~init_in_progress)

    def incr_lru_on_hit(self,
                        lru_incr: IndexWayHs,
                        tag_res: AxiCacheTagArrayLookupResIntf):
        """
        Drive the LRU increment port from the tag lookup result.
        The increment is valid only if the lookup result is valid and the tag was found.

        :param lru_incr: increment port of lru_array
        :param tag_res: tag lookup result for the address of the transaction
        """
        index = self.parse_addr(tag_res.addr)[1]
        lru_incr.vld(tag_res.vld & tag_res.found)
        lru_incr.way(tag_res.way)
        lru_incr.index(index)

    def read_handler(self,
                     ar_tagRes: AxiCacheTagArrayLookupResIntf,  # in
                     axi_s_r: Axi4_r,  # out
                     ar_lru_incr: IndexWayHs,  # out
                     da_r: TransRamHsR,  # in
                     axi_m_ar: Axi4_addr,  # out
                     axi_m_r: Axi4_r  # in
                     ):
        """
        :param ar_tagRes: Read request including information from tag_array for given tag.
        :param axi_s_r: Read data requested by ar_tagRes.
        :param ar_lru_incr: Incrementing LRU for given address when tag is found.
        :param da_r: Read interface of data_array used when tag is found.
        :param axi_m_ar: Read address request interface to memory when tag is not found.
        :param axi_m_r: Read data requested by axi_m_ar from memory when tag is not found.

        .. figure:: ./_static/AxiCacheWriteAllocWawOnlyWritePropagating_read_handler.png
        """
        self.incr_lru_on_hit(ar_lru_incr, ar_tagRes)

        # add a register with a backup register for a potential overflow
        # we need this as we need to check if we can store data in advance.
        # this is because we need a higher priority for flushing
        # in order to avoid deadlock.
        data_arr_read_req = HsBuilder(self, da_r.addr, master_to_slave=False)\
            .buff(1, latency=(1, 2))\
            .end

        # send read request to data_array
        ar_index = self.parse_addr(ar_tagRes.addr)[1]
        data_arr_read_req.priv(ar_tagRes.id)
        data_arr_read_req.addr(self.addr_in_data_array(ar_tagRes.way, ar_index))

        # delegate read request to m.ar if not hit
        # (exactly one of the outputs is used for each request, the other one is skipped)
        StreamNode(
            [ar_tagRes],
            [axi_m_ar, data_arr_read_req],
            extraConds={
                axi_m_ar: ar_tagRes.vld & ~ar_tagRes.found,
                data_arr_read_req: ar_tagRes.vld & ar_tagRes.found,
            },
            skipWhen={
                axi_m_ar: ar_tagRes.vld & ar_tagRes.found,
                data_arr_read_req: ar_tagRes.vld & ~ar_tagRes.found,
            },
        ).sync()

        axi_m_ar.addr(ar_tagRes.addr)
        axi_m_ar.id(ar_tagRes.id)
        self.axiAddrDefaults(axi_m_ar)

        # convert the read data from data_array to an interface of the same type
        # as s.r, the response is always OKAY
        data_arr_read = axi_s_r.__class__()
        data_arr_read._updateParamsFrom(axi_s_r)
        self.data_arr_read = data_arr_read
        data_arr_read(da_r.data, exclude=[data_arr_read.resp])
        data_arr_read.resp(RESP_OKAY)

        data_arr_read = AxiSBuilder(self, data_arr_read)\
            .buff(1, latency=(1, 2))\
            .end

        # the data from data_array is listed first in the prioritized join
        s_r = AxiSBuilder.join_prioritized(self, [
            data_arr_read,
            axi_m_r,
        ]).end
        axi_s_r(s_r)

    def resolve_victim(self,
                       st0_o_tag_found: RtlSignal,  # in
                       st0_o_found_way: RtlSignal,  # in
                       st0_o_tags: List[StructIntf],  # in
                       victim_way: Handshaked  # in
                       ):
        """
        Select the way (and its current tag) which will be used for the written cache line.

        :param st0_o_tag_found: flag which tells that the tag was found in the set
        :param st0_o_found_way: way where the tag was found (used if st0_o_tag_found)
        :param st0_o_tags: tag records of all ways of the set
        :param victim_way: way selected by lru_array (used if there is no empty way)
        :return: tuple (way, tag), the way is st0_o_found_way if the tag was found,
            otherwise the first way with an invalid tag or the way from victim_way
            if all tags are valid, the tag is the tag currently stored in the way
            selected among the empty/victim ways
        """
        _victim_way = self._sig("victim_way_tmp", Bits(log2ceil(self.WAY_CNT)))
        _victim_tag = self._sig("victim_tag_tmp", Bits(self.TAG_W))
        SwitchLogic(
            [
                # select first empty tag
                (~tag.valid, [
                    _victim_way(i),
                    _victim_tag(tag.tag),
                ]) for i, tag in enumerate(st0_o_tags)
            ],
            default=[
                # select a victim specified by victim_way
                _victim_way(victim_way.data),
                SwitchLogic([
                        (victim_way.data._eq(i), _victim_tag(tag.tag))
                        for i, tag in enumerate(st0_o_tags)
                    ],
                    default=_victim_tag(None)
                )
            ]
        )
        # the way of the hit has a priority over the empty/victim way
        _victim_way = st0_o_tag_found._ternary(st0_o_found_way, _victim_way)
        return _victim_way, _victim_tag

    def write_handler(self,
                      aw_tagRes: AxiCacheTagArrayLookupResIntf,  # in
                      axi_s_b: Axi4_b,  # out
                      aw_lru_incr: IndexWayHs,  # out
                      victim_way_req: AddrHs, victim_way_resp: Handshaked,  # out, in
                      da_w: TransRamHsW,  # in
                      tag_update: AxiCacheTagArrayUpdateIntf,  # out
                      init_in_progress: RtlSignal,  # in
                      ):
        """
        :param aw_tagRes: Write request including in information from tag_array for given tag.
        :param axi_s_b: Response requested by aw_tagRes
        :param aw_lru_incr: Incrementing LRU for given address when tag is found.
        :param victim_way_req: Request victim from LRU array for a specified index,
            when cache is full.
        :param victim_way_resp: Victim address requested by victim_way_req
        :param da_w: Write interface of data_array to write and initiate flush
            when cache is full.
        :param tag_update: Tag update interface for newly written data.
        :param init_in_progress: Register which is set while tag_array and lru_array
            are being cleared, it is cleared by this method after the last index
            is processed and the write data path is blocked while it is set.

        .. figure:: ./_static/AxiCacheWriteAllocWawOnlyWritePropagating_write_handler.png
        """
        # note that the lru update happens even if the data is stalled
        # but that is not a problem because it won't change the order of the usage
        # of the cacheline
        self.incr_lru_on_hit(aw_lru_incr, aw_tagRes)

        st0 = HandshakedReg(HsStructIntf)
        st0.T = HStruct(
            # the original id and address of a write transaction
            (self.s.aw.id._dtype, "write_id"),
            (self.s.aw.addr._dtype, "replacement_addr"),
            # array of tags for cachelines with this index
            (aw_tagRes.TAG_T[aw_tagRes.WAY_CNT], "tags"),
            (BIT, "tag_found"),
            (BIT, "had_empty"),  # had some empty tag
            (aw_tagRes.way._dtype, "found_way"),  # way of where tag was found
        )
        self.victim_load_status0 = st0
        st0_i = st0.dataIn.data

        # resolve if we need to select a victim and optionally ask for it
        has_empty = rename_signal(self, Or(*(~t.valid for t in aw_tagRes.tags)), "has_empty")
        st0_i.write_id(aw_tagRes.id)
        st0_i.replacement_addr(aw_tagRes.addr)
        st0_i.tags(aw_tagRes.tags)
        st0_i.tag_found(aw_tagRes.found)
        st0_i.found_way(aw_tagRes.way)
        st0_i.had_empty(has_empty)
        victim_way_req.addr(self.parse_addr(aw_tagRes.addr)[1])
        # the victim is requested only if the tag was not found and the set has no empty way
        StreamNode(
            [aw_tagRes],
            [victim_way_req, st0.dataIn],
            skipWhen={
                victim_way_req: aw_tagRes.vld & (
                    aw_tagRes.found |
                    has_empty
                )
            },
            extraConds={
                victim_way_req: ~aw_tagRes.found & ~has_empty
            }
        ).sync()

        ########################## st1 - pre (read request resolution, victim address resolution) ##############
        st0_o = st0.dataOut.data
        _victim_way, _victim_tag = self.resolve_victim(
            st0_o.tag_found, st0_o.found_way, st0_o.tags, victim_way_resp)

        # the original cache line has to be flushed if it is replaced
        # (the tag was not found and there was not any empty way)
        da_w.addr.flush(rename_signal(
            self,
            st0.dataOut.vld & (~st0_o.had_empty & ~st0_o.tag_found),
            "need_to_flush"))
        da_w.addr.priv.id(st0_o.write_id)
        da_w.addr.addr(self.addr_in_data_array(
            st0_o.tag_found._ternary(st0_o.found_way, _victim_way),
            self.parse_addr(st0_o.replacement_addr)[1]))
        da_w.addr.priv.victim_tag(_victim_tag)

        MULTI_WORD = self.data_array.ITEM_WORDS > 1
        if MULTI_WORD:
            # the id has to be stored in an extra register until the last word is written
            st1_id = HandshakedReg(Handshaked)
            st1_id.LATENCY = (1, 2)
            st1_id.DATA_WIDTH = self.ID_WIDTH
            self.victim_load_status1 = st1_id
            st1_id.dataIn.data(st0_o.write_id)

        # placed between st0, st1
        # (the victim response is consumed only if the victim was requested)
        StreamNode(
            [victim_way_resp, st0.dataOut],
            [da_w.addr, st1_id.dataIn] if MULTI_WORD else [da_w.addr],
            extraConds={
                victim_way_resp: ~st0_o.tag_found & ~st0_o.had_empty,
            },
            skipWhen={
                victim_way_resp: st0_o.tag_found | st0_o.had_empty,
            }
        ).sync()

        in_w = AxiSBuilder(self, self.s.w)\
            .buff(self.tag_array.LOOKUP_LATENCY + 4)\
            .end

        # the write response is generated together with the last word of the data
        if MULTI_WORD:
            StreamNode(
                [in_w, st1_id.dataOut],
                [da_w.data, axi_s_b],
                extraConds={axi_s_b: in_w.valid & in_w.last,
                            st1_id.dataOut: in_w.valid & in_w.last},
                skipWhen={axi_s_b: in_w.valid & ~in_w.last,
                          st1_id.dataOut: in_w.valid & ~in_w.last},
            ).sync(~init_in_progress)
            axi_s_b.id(st1_id.dataOut.data)  # todo
        else:
            StreamNode(
                [in_w],
                [da_w.data, axi_s_b],
                extraConds={axi_s_b: in_w.valid & in_w.last},
                skipWhen={axi_s_b: in_w.valid & ~in_w.last},
            ).sync(~init_in_progress)
            axi_s_b.id(st0_o.write_id)

        axi_s_b.resp(RESP_OKAY)
        da_w.data(in_w, exclude=[in_w.ready, in_w.valid])

        lru_array_set = self.lru_array.set
        init_cntr = self._reg("init_cntr", lru_array_set.addr._dtype, def_val=0)
        If(init_in_progress,
            # initialization: delete the tags in all ways and clear the LRU record
            # for the index specified by init_cntr
            tag_update.vld(1),
            tag_update.delete(1),
            tag_update.way_en(mask(tag_update.way_en._dtype.bit_length())),
            tag_update.addr(self.tag_array.deparse_addr(0, init_cntr, 0)),
            lru_array_set.addr(init_cntr),
            lru_array_set.data(0),
            lru_array_set.vld(1),
            If(init_cntr._eq(mask(init_cntr._dtype.bit_length())),
                # the last index was processed
                init_cntr(0),
                init_in_progress(0),
            ).Else(
                init_cntr(init_cntr + 1),
            )
        ).Else(
            # NOTE(review): this is qualified only by st0.dataOut.vld and da_w.addr.rd,
            # not by victim_way_resp.vld - confirm that the victim response
            # is always valid at this point
            tag_update.vld(st0.dataOut.vld & da_w.addr.rd),
            tag_update.delete(0),
            tag_update.way_en(binToOneHot(_victim_way)),
            tag_update.addr(st0_o.replacement_addr),
            lru_array_set.addr(None),
            lru_array_set.data(None),
            lru_array_set.vld(0),
        )

    def flush_handler(self,
                      flush_data: TransRamHsW,  # in
                      axi_m_aw: Axi4_addr,  # out
                      axi_m_w: Axi4_w,  # out
                      axi_m_b: Axi4_b,  # in
                      ):
        """
        Write the cache lines flushed from data_array to the memory on the master port.

        :param flush_data: address and data of the flushed cache line from data_array
        :param axi_m_aw: write address channel to memory
        :param axi_m_w: write data channel to memory, all bytes are always enabled
        :param axi_m_b: write response channel from memory, it is always ready
            and the response is not used
        """
        id_tag = flush_data.addr.priv
        # potentially cut msb bits which do specify the way from address
        axi_m_aw.addr(self.deparse_addr(id_tag.victim_tag, flush_data.addr.addr[self.INDEX_W:], 0))
        axi_m_aw.id(id_tag.id)
        self.axiAddrDefaults(axi_m_aw)
        StreamNode(
            [flush_data.addr, ],
            [axi_m_aw, ]
        ).sync()

        axi_m_w.data(flush_data.data.data)
        axi_m_w.strb(mask(axi_m_w.data._dtype.bit_length() // 8))
        axi_m_w.last(flush_data.data.last)
        StreamNode(
            [flush_data.data],
            [axi_m_w],
        ).sync()

        axi_m_b.ready(1)

    def _impl(self):
        """
        Read operation:

        * Use index to lookup in tag memory
        * if tag matches return cacheline else dispatch read request
          (the transaction is dispatched with original id, upon data receive the transaction
          is passed to master without any synchronization with the cache)

        Write operation:

        * Use index to lookup in tag memory
        * If tag matches and the cacheline is not being replaced update the data in data array.
        * If tag is not found in corresponding set select a victim and read it from data array,
          flush it and write back cacheline to array and update tag
        """
        # the arrays have to be cleared first if the memories are not preinitialized
        init_in_progress = self._reg("init_in_progress", def_val=int(not self.IS_PREINITIALIZED))

        self.connect_tag_lookup(init_in_progress)
        ar_tagRes, aw_tagRes = self.tag_array.lookupRes
        self.read_handler(
            ar_tagRes,
            self.s.r,
            self.lru_array.incr[0],
            self.data_array.r,
            self.m.ar,
            self.m.r,
        )
        self.write_handler(
            aw_tagRes,
            self.s.b,
            self.lru_array.incr[1],
            self.lru_array.victim_req,
            self.lru_array.victim_data,
            self.data_array.w,
            self.tag_array.update[0],
            init_in_progress,
        )
        self.flush_handler(
            self.data_array.flush_data,
            self.m.aw,
            self.m.w,
            self.m.b,
        )
        propagateClkRstn(self)
def _example_AxiCacheWriteAllocWawOnlyWritePropagating():
    """
    Create a small instance of the cache (16b data, 2B cache lines, 2 ways,
    16 cache lines) which is used as an example in the documentation.

    :return: the configured AxiCacheWriteAllocWawOnlyWritePropagating instance
    """
    example_params = (
        ("DATA_WIDTH", 16),
        ("CACHE_LINE_SIZE", 2),
        ("WAY_CNT", 2),
        ("CACHE_LINE_CNT", 16),
        ("MAX_BLOCK_DATA_WIDTH", 8),
    )
    cache = AxiCacheWriteAllocWawOnlyWritePropagating()
    for param_name, param_value in example_params:
        setattr(cache, param_name, param_value)
    return cache
if __name__ == "__main__":
    # print the RTL of a 4-way cache with 512b wide data and 64B cache lines
    from hwt.synthesizer.utils import to_rtl_str
    from hwtLib.xilinx.constants import XILINX_VIVADO_MAX_DATA_WIDTH

    cache = AxiCacheWriteAllocWawOnlyWritePropagating()
    cache.DATA_WIDTH = 512
    cache.CACHE_LINE_SIZE = cache.DATA_WIDTH // 8
    cache.WAY_CNT = 4
    cache.CACHE_LINE_CNT = cache.WAY_CNT * 4096
    cache.MAX_BLOCK_DATA_WIDTH = XILINX_VIVADO_MAX_DATA_WIDTH

    # alternative configurations kept for experiments:
    # cache.CACHE_LINE_SIZE = 64
    # cache.DATA_WIDTH = 512
    # cache.WAY_CNT = 2
    # cache.CACHE_LINE_CNT = cache.WAY_CNT * 4096
    # cache = _example_AxiCacheWriteAllocWawOnlyWritePropagating()

    rtl = to_rtl_str(cache)
    print(rtl)