This commit is contained in:
hiGepi 2022-11-18 15:07:43 +01:00
parent 7edaf0f745
commit 1b032d43db
185 changed files with 56362 additions and 0 deletions

Binary file not shown.

Binary file not shown.

BIN
T1/TP/TP1/HPL-2009-85.pdf Normal file

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,194 @@
# Cache size
//-size (bytes) 528
//-size (bytes) 4096
//-size (bytes) 262144
//-size (bytes) 1048576
//-size (bytes) 2097152
//-size (bytes) 4194304
//-size (bytes) 8388608
//-size (bytes) 16777216
//-size (bytes) 33554432
//-size (bytes) 134217728
//-size (bytes) 268435456
//-size (bytes) 536870912
//-size (bytes) 67108864
//-size (bytes) 536870912
//-size (bytes) 1073741824
# For 3D DRAM memory please use Gb as units
-size (Gb) 2
# Line size
//-block size (bytes) 8
-block size (bytes) 128
# To model Fully Associative cache, set associativity to zero
//-associativity 0
//-associativity 2
//-associativity 4
-associativity 1
//-associativity 16
-read-write port 1
-exclusive read port 0
-exclusive write port 0
-single ended read ports 0
# Multiple banks connected using a bus
-UCA bank count 16
//-technology (u) 0.032
//-technology (u) 0.040
//-technology (u) 0.065
//-technology (u) 0.078
-technology (u) 0.080
# following three parameters are meaningful only for main memories
//-page size (bits) 8192
-burst length 4
-internal prefetch width 1
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
//-Data array cell type - "itrs-hp"
//-Data array cell type - "itrs-lstp"
//-Data array cell type - "itrs-lop"
-Data array cell type - "comm-dram"
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
//-Data array peripheral type - "itrs-hp"
-Data array peripheral type - "itrs-lstp"
//-Data array peripheral type - "itrs-lop"
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
-Tag array cell type - "itrs-hp"
//-Tag array cell type - "itrs-lstp"
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
-Tag array peripheral type - "itrs-hp"
//-Tag array peripheral type - "itrs-lstp"
# Bus width includes data bits and address bits required by the decoder
//-output/input bus width 16
//-output/input bus width 64
-output/input bus width 64
// 300-400 in steps of 10
-operating temperature (K) 350
# Type of memory - cache (with a tag array) or ram (scratch ram similar to a register file)
# or main memory (no tag array and every access will happen at a page granularity Ref: CACTI 5.3 report)
//-cache type "cache"
//-cache type "ram"
//-cache type "main memory"
-cache type "3D memory or 2D main memory"
# Parameters for 3D DRAM
//-page size (bits) 16384
-page size (bits) 8192
//-page size (bits) 4096
-burst depth 4
-IO width 4
-system frequency (MHz) 266
-stacked die count 1
-partitioning granularity 0 // 0: coarse-grained rank-level; 1: fine-grained rank-level
//-TSV projection 1 // 0: ITRS aggressive; 1: industrial conservative
## End of parameters for 3D DRAM
# to model special structure like branch target buffers, directory, etc.
# change the tag size parameter
# if you want cacti to calculate the tagbits, set the tag size to "default"
-tag size (b) "default"
//-tag size (b) 45
# fast - data and tag access happen in parallel
# sequential - data array is accessed after accessing the tag array
# normal - data array lookup and tag access happen in parallel
# final data block is broadcasted in data array h-tree
# after getting the signal from the tag array
-access mode (normal, sequential, fast) - "fast"
//-access mode (normal, sequential, fast) - "normal"
//-access mode (normal, sequential, fast) - "sequential"
# DESIGN OBJECTIVE for UCA (or banks in NUCA)
-design objective (weight delay, dynamic power, leakage power, cycle time, area) 0:0:0:0:100
# Percentage deviation from the minimum value
# Ex: A deviation value of 10:1000:1000:1000:1000 will try to find an organization
# that compromises at most 10% delay.
# NOTE: Try reasonable values for % deviation. Inconsistent deviation
# percentage values will not produce any valid organizations. For example,
# 0:0:100:100:100 will try to identify an organization that has both
# least delay and dynamic power. Since such an organization is not possible, CACTI will
# throw an error. Refer CACTI-6 Technical report for more details
-deviate (delay, dynamic power, leakage power, cycle time, area) 50:100000:100000:100000:1000000
# Objective for NUCA
-NUCAdesign objective (weight delay, dynamic power, leakage power, cycle time, area) 0:0:0:0:100
-NUCAdeviate (delay, dynamic power, leakage power, cycle time, area) 10:10000:10000:10000:10000
# Set optimize tag to ED or ED^2 to obtain a cache configuration optimized for
# energy-delay or energy-delay sq. product
# Note: Optimize tag will disable weight or deviate values mentioned above
# Set it to NONE to let weight and deviate values determine the
# appropriate cache configuration
//-Optimize ED or ED^2 (ED, ED^2, NONE): "ED"
//-Optimize ED or ED^2 (ED, ED^2, NONE): "ED^2"
-Optimize ED or ED^2 (ED, ED^2, NONE): "NONE"
-Cache model (NUCA, UCA) - "UCA"
//-Cache model (NUCA, UCA) - "NUCA"
# In order for CACTI to find the optimal NUCA bank value the following
# variable should be assigned 0.
-NUCA bank count 0
# NOTE: for nuca network frequency is set to a default value of
# 5GHz in time.c. CACTI automatically
# calculates the maximum possible frequency and downgrades this value if necessary
# By default CACTI considers both full-swing and low-swing
# wires to find an optimal configuration. However, it is possible to
# restrict the search space by changing the signaling from "default" to
# "fullswing" or "lowswing" type.
-Wire signaling (fullswing, lowswing, default) - "Global_5"
//-Wire signaling (fullswing, lowswing, default) - "default"
//-Wire signaling (fullswing, lowswing, default) - "lowswing"
//-Wire inside mat - "global"
-Wire inside mat - "semi-global"
-Wire outside mat - "global"
//-Wire outside mat - "semi-global"
-Interconnect projection - "conservative"
//-Interconnect projection - "aggressive"
# Contention in network (which is a function of core count and cache level) is one of
# the critical factors used for deciding the optimal bank count value
# core count can be 4, 8, or 16
//-Core count 4
-Core count 8
//-Core count 16
-Cache level (L2/L3) - "L3"
-Add ECC - "false"
//-Print level (DETAILED, CONCISE) - "CONCISE"
-Print level (DETAILED, CONCISE) - "DETAILED"
# for debugging
//-Print input parameters - "true"
-Print input parameters - "false"
# force CACTI to model the cache with the
# following Ndbl, Ndwl, Nspd, Ndsam,
# and Ndcm values
-Force cache config - "true"
//-Force cache config - "false"
-Ndwl 128
-Ndbl 32
-Nspd 1
-Ndcm 1
-Ndsam1 1
-Ndsam2 1

View file

@ -0,0 +1,194 @@
# Cache size
//-size (bytes) 528
//-size (bytes) 4096
//-size (bytes) 262144
//-size (bytes) 1048576
//-size (bytes) 2097152
//-size (bytes) 4194304
//-size (bytes) 8388608
//-size (bytes) 16777216
//-size (bytes) 33554432
//-size (bytes) 134217728
//-size (bytes) 268435456
//-size (bytes) 536870912
//-size (bytes) 67108864
//-size (bytes) 536870912
//-size (bytes) 1073741824
# For 3D DRAM memory please use Gb as units
-size (Gb) 1
# Line size
//-block size (bytes) 8
-block size (bytes) 128
# To model Fully Associative cache, set associativity to zero
//-associativity 0
//-associativity 2
//-associativity 4
-associativity 1
//-associativity 16
-read-write port 1
-exclusive read port 0
-exclusive write port 0
-single ended read ports 0
# Multiple banks connected using a bus
-UCA bank count 8
//-technology (u) 0.032
//-technology (u) 0.040
//-technology (u) 0.065
-technology (u) 0.078
//-technology (u) 0.080
# following three parameters are meaningful only for main memories
//-page size (bits) 8192
-burst length 4
-internal prefetch width 1
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
//-Data array cell type - "itrs-hp"
//-Data array cell type - "itrs-lstp"
//-Data array cell type - "itrs-lop"
-Data array cell type - "comm-dram"
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
//-Data array peripheral type - "itrs-hp"
-Data array peripheral type - "itrs-lstp"
//-Data array peripheral type - "itrs-lop"
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
-Tag array cell type - "itrs-hp"
//-Tag array cell type - "itrs-lstp"
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
-Tag array peripheral type - "itrs-hp"
//-Tag array peripheral type - "itrs-lstp"
# Bus width includes data bits and address bits required by the decoder
//-output/input bus width 16
//-output/input bus width 64
-output/input bus width 64
// 300-400 in steps of 10
-operating temperature (K) 350
# Type of memory - cache (with a tag array) or ram (scratch ram similar to a register file)
# or main memory (no tag array and every access will happen at a page granularity Ref: CACTI 5.3 report)
//-cache type "cache"
//-cache type "ram"
//-cache type "main memory"
-cache type "3D memory or 2D main memory"
## Parameters for 3D DRAM
-page size (bits) 16384
//-page size (bits) 8192
-burst depth 8
-IO width 4
-system frequency (MHz) 533
-stacked die count 1
-partitioning granularity 0 // 0: coarse-grained rank-level; 1: fine-grained rank-level
//-TSV projection 1 // 0: ITRS aggressive; 1: industrial conservative
## End of parameters for 3D DRAM
# to model special structure like branch target buffers, directory, etc.
# change the tag size parameter
# if you want cacti to calculate the tagbits, set the tag size to "default"
-tag size (b) "default"
//-tag size (b) 45
# fast - data and tag access happen in parallel
# sequential - data array is accessed after accessing the tag array
# normal - data array lookup and tag access happen in parallel
# final data block is broadcasted in data array h-tree
# after getting the signal from the tag array
-access mode (normal, sequential, fast) - "fast"
//-access mode (normal, sequential, fast) - "normal"
//-access mode (normal, sequential, fast) - "sequential"
# DESIGN OBJECTIVE for UCA (or banks in NUCA)
-design objective (weight delay, dynamic power, leakage power, cycle time, area) 0:0:0:0:10
# Percentage deviation from the minimum value
# Ex: A deviation value of 10:1000:1000:1000:1000 will try to find an organization
# that compromises at most 10% delay.
# NOTE: Try reasonable values for % deviation. Inconsistent deviation
# percentage values will not produce any valid organizations. For example,
# 0:0:100:100:100 will try to identify an organization that has both
# least delay and dynamic power. Since such an organization is not possible, CACTI will
# throw an error. Refer CACTI-6 Technical report for more details
-deviate (delay, dynamic power, leakage power, cycle time, area) 50:100000:100000:100000:1000000
# Objective for NUCA
-NUCAdesign objective (weight delay, dynamic power, leakage power, cycle time, area) 100:100:0:0:100
-NUCAdeviate (delay, dynamic power, leakage power, cycle time, area) 10:10000:10000:10000:10000
# Set optimize tag to ED or ED^2 to obtain a cache configuration optimized for
# energy-delay or energy-delay sq. product
# Note: Optimize tag will disable weight or deviate values mentioned above
# Set it to NONE to let weight and deviate values determine the
# appropriate cache configuration
//-Optimize ED or ED^2 (ED, ED^2, NONE): "ED"
//-Optimize ED or ED^2 (ED, ED^2, NONE): "ED^2"
-Optimize ED or ED^2 (ED, ED^2, NONE): "NONE"
-Cache model (NUCA, UCA) - "UCA"
//-Cache model (NUCA, UCA) - "NUCA"
# In order for CACTI to find the optimal NUCA bank value the following
# variable should be assigned 0.
-NUCA bank count 0
# NOTE: for nuca network frequency is set to a default value of
# 5GHz in time.c. CACTI automatically
# calculates the maximum possible frequency and downgrades this value if necessary
# By default CACTI considers both full-swing and low-swing
# wires to find an optimal configuration. However, it is possible to
# restrict the search space by changing the signaling from "default" to
# "fullswing" or "lowswing" type.
-Wire signaling (fullswing, lowswing, default) - "Global_30"
//-Wire signaling (fullswing, lowswing, default) - "default"
//-Wire signaling (fullswing, lowswing, default) - "lowswing"
//-Wire inside mat - "global"
-Wire inside mat - "semi-global"
-Wire outside mat - "global"
//-Wire outside mat - "semi-global"
-Interconnect projection - "conservative"
//-Interconnect projection - "aggressive"
# Contention in network (which is a function of core count and cache level) is one of
# the critical factors used for deciding the optimal bank count value
# core count can be 4, 8, or 16
//-Core count 4
-Core count 8
//-Core count 16
-Cache level (L2/L3) - "L3"
-Add ECC - "true"
//-Print level (DETAILED, CONCISE) - "CONCISE"
-Print level (DETAILED, CONCISE) - "DETAILED"
# for debugging
//-Print input parameters - "true"
-Print input parameters - "false"
# force CACTI to model the cache with the
# following Ndbl, Ndwl, Nspd, Ndsam,
# and Ndcm values
-Force cache config - "true"
//-Force cache config - "false"
-Ndwl 16
-Ndbl 16
-Nspd 1
-Ndcm 1
-Ndsam1 1
-Ndsam2 1

View file

@ -0,0 +1,197 @@
# Cache size
//-size (bytes) 528
//-size (bytes) 4096
//-size (bytes) 262144
//-size (bytes) 1048576
//-size (bytes) 2097152
//-size (bytes) 4194304
//-size (bytes) 8388608
//-size (bytes) 16777216
//-size (bytes) 33554432
//-size (bytes) 134217728
//-size (bytes) 268435456
//-size (bytes) 536870912
//-size (bytes) 67108864
//-size (bytes) 536870912
//-size (bytes) 1073741824
# For 3D DRAM memory please use Gb as units
-size (Gb) 8
# Line size
//-block size (bytes) 8
-block size (bytes) 128
# To model Fully Associative cache, set associativity to zero
//-associativity 0
//-associativity 2
//-associativity 4
-associativity 1
//-associativity 16
-read-write port 1
-exclusive read port 0
-exclusive write port 0
-single ended read ports 0
# Multiple banks connected using a bus
-UCA bank count 8
//-technology (u) 0.032
//-technology (u) 0.040
//-technology (u) 0.065
//-technology (u) 0.078
//-technology (u) 0.080
//-technology (u) 0.090
-technology (u) 0.050
# following three parameters are meaningful only for main memories
//-page size (bits) 8192
-burst length 4
-internal prefetch width 1
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
//-Data array cell type - "itrs-hp"
//-Data array cell type - "itrs-lstp"
//-Data array cell type - "itrs-lop"
-Data array cell type - "comm-dram"
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
//-Data array peripheral type - "itrs-hp"
-Data array peripheral type - "itrs-lstp"
//-Data array peripheral type - "itrs-lop"
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
-Tag array cell type - "itrs-hp"
//-Tag array cell type - "itrs-lstp"
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
-Tag array peripheral type - "itrs-hp"
//-Tag array peripheral type - "itrs-lstp"
# Bus width includes data bits and address bits required by the decoder
//-output/input bus width 16
//-output/input bus width 64
-output/input bus width 64
// 300-400 in steps of 10
-operating temperature (K) 350
# Type of memory - cache (with a tag array) or ram (scratch ram similar to a register file)
# or main memory (no tag array and every access will happen at a page granularity Ref: CACTI 5.3 report)
//-cache type "cache"
//-cache type "ram"
//-cache type "main memory" # old main memory model, in fact, it is eDRAM model.
-cache type "3D memory or 2D main memory" # once this parameter is used, the new parameter section below of <Parameters for 3D DRAM and 2D main memory> will override the same parameter above
# <Parameters for 3D DRAM and 2D main memory>
//-page size (bits) 16384
-page size (bits) 8192
//-page size (bits) 4096
-burst depth 8 // for 3D DRAM, IO per bank equals the product of burst depth and IO width
-IO width 4
-system frequency (MHz) 677
-stacked die count 4
-partitioning granularity 0 // 0: coarse-grained rank-level; 1: fine-grained rank-level
-TSV projection 1 // 0: ITRS aggressive; 1: industrial conservative
## End of parameters for 3D DRAM
# to model special structure like branch target buffers, directory, etc.
# change the tag size parameter
# if you want cacti to calculate the tagbits, set the tag size to "default"
-tag size (b) "default"
//-tag size (b) 45
# fast - data and tag access happen in parallel
# sequential - data array is accessed after accessing the tag array
# normal - data array lookup and tag access happen in parallel
# final data block is broadcasted in data array h-tree
# after getting the signal from the tag array
-access mode (normal, sequential, fast) - "fast"
//-access mode (normal, sequential, fast) - "normal"
//-access mode (normal, sequential, fast) - "sequential"
# DESIGN OBJECTIVE for UCA (or banks in NUCA)
-design objective (weight delay, dynamic power, leakage power, cycle time, area) 0:0:0:0:100
# Percentage deviation from the minimum value
# Ex: A deviation value of 10:1000:1000:1000:1000 will try to find an organization
# that compromises at most 10% delay.
# NOTE: Try reasonable values for % deviation. Inconsistent deviation
# percentage values will not produce any valid organizations. For example,
# 0:0:100:100:100 will try to identify an organization that has both
# least delay and dynamic power. Since such an organization is not possible, CACTI will
# throw an error. Refer CACTI-6 Technical report for more details
-deviate (delay, dynamic power, leakage power, cycle time, area) 50:100000:100000:100000:1000000
# Objective for NUCA
-NUCAdesign objective (weight delay, dynamic power, leakage power, cycle time, area) 0:0:0:0:100
-NUCAdeviate (delay, dynamic power, leakage power, cycle time, area) 10:10000:10000:10000:10000
# Set optimize tag to ED or ED^2 to obtain a cache configuration optimized for
# energy-delay or energy-delay sq. product
# Note: Optimize tag will disable weight or deviate values mentioned above
# Set it to NONE to let weight and deviate values determine the
# appropriate cache configuration
//-Optimize ED or ED^2 (ED, ED^2, NONE): "ED"
//-Optimize ED or ED^2 (ED, ED^2, NONE): "ED^2"
-Optimize ED or ED^2 (ED, ED^2, NONE): "NONE"
-Cache model (NUCA, UCA) - "UCA"
//-Cache model (NUCA, UCA) - "NUCA"
# In order for CACTI to find the optimal NUCA bank value the following
# variable should be assigned 0.
-NUCA bank count 0
# NOTE: for nuca network frequency is set to a default value of
# 5GHz in time.c. CACTI automatically
# calculates the maximum possible frequency and downgrades this value if necessary
# By default CACTI considers both full-swing and low-swing
# wires to find an optimal configuration. However, it is possible to
# restrict the search space by changing the signaling from "default" to
# "fullswing" or "lowswing" type.
-Wire signaling (fullswing, lowswing, default) - "Global_30"
//-Wire signaling (fullswing, lowswing, default) - "default"
//-Wire signaling (fullswing, lowswing, default) - "lowswing"
//-Wire inside mat - "global"
-Wire inside mat - "semi-global"
-Wire outside mat - "global"
//-Wire outside mat - "semi-global"
-Interconnect projection - "conservative"
//-Interconnect projection - "aggressive"
# Contention in network (which is a function of core count and cache level) is one of
# the critical factors used for deciding the optimal bank count value
# core count can be 4, 8, or 16
//-Core count 4
-Core count 8
//-Core count 16
-Cache level (L2/L3) - "L3"
-Add ECC - "true"
//-Print level (DETAILED, CONCISE) - "CONCISE"
-Print level (DETAILED, CONCISE) - "DETAILED"
# for debugging
//-Print input parameters - "true"
-Print input parameters - "false"
# force CACTI to model the cache with the
# following Ndbl, Ndwl, Nspd, Ndsam,
# and Ndcm values
-Force cache config - "true"
//-Force cache config - "false"
-Ndwl 16
-Ndbl 32
-Nspd 1
-Ndcm 1
-Ndsam1 1
-Ndsam2 1

122
T1/TP/TP1/cacti_7/README Normal file
View file

@ -0,0 +1,122 @@
-----------------------------------------------------------
____ __ ________ __
/\ _`\ /\ \__ __ /\_____ \ /'__`\
\ \ \/\_\ __ ___\ \ ,_\/\_\ \/___//'/'/\ \/\ \
\ \ \/_/_ /'__`\ /'___\ \ \/\/\ \ /' /' \ \ \ \ \
\ \ \L\ \/\ \L\.\_/\ \__/\ \ \_\ \ \ /' /'__ \ \ \_\ \
\ \____/\ \__/.\_\ \____\\ \__\\ \_\ /\_/ /\_\ \ \____/
\/___/ \/__/\/_/\/____/ \/__/ \/_/ \// \/_/ \/___/
A Tool to Model Caches/Memories, 3D stacking, and off-chip IO
-----------------------------------------------------------
CACTI is an analytical tool that takes a set of cache/memory para-
meters as input and calculates its access time, power, cycle
time, and area.
CACTI was originally developed by Dr. Jouppi and Dr. Wilton
in 1993 and since then it has undergone six major
revisions.
List of features (version 1-7):
===============================
The following is the list of features supported by the tool.
* Power, delay, area, and cycle time model for
direct mapped caches
set-associative caches
fully associative caches
Embedded DRAM memories
Commodity DRAM memories
* Support for modeling multi-ported uniform cache access (UCA)
and multi-banked, multi-ported non-uniform cache access (NUCA).
* Leakage power calculation that also considers the operating
temperature of the cache.
* Router power model.
* Interconnect model with different delay, power, and area
properties including low-swing wire model.
* An interface to perform trade-off analysis involving power, delay,
area, and bandwidth.
* All process specific values used by the tool are obtained
from ITRS and currently, the tool supports 90nm, 65nm, 45nm,
and 32nm technology nodes.
* Chip IO model to calculate latency and energy for DDR bus. Users can model
different loads (fan-outs) and evaluate the impact on frequency and energy.
This model can be used to study LR-DIMMs, R-DIMMs, etc.
Version 7.0 is derived from 6.5 and merged with CACTI 3D.
It has many new additions apart from code refinements and
bug fixes: new IO model, 3D memory model, and power gating models.
Ref: CACTI-IO: CACTI With OFF-chip Power-Area-Timing Models
MemCAD: An Interconnect Exploratory Tool for Innovative Memories Beyond DDR4
CACTI-3DD: Architecture-level modeling for 3D die-stacked DRAM main memory
--------------------------------------------------------------------------
Version 6.5 has a new c++ code base and includes numerous bug fixes.
CACTI 5.3 and 6.0 activate an entire row of mats to read/write a single
block of data. This technique improves reliability at the cost of
power. CACTI 6.5 activates minimum number of mats just enough to retrieve
a block to minimize power.
How to use the tool?
====================
Prior versions of CACTI take input parameters such as cache
size and technology node as a set of command line arguments.
To avoid a long list of command line arguments,
CACTI 6.5 and later versions let users specify their cache model in a more
detailed manner by using a config file (cache.cfg).
-> define the cache model using cache.cfg
-> run the "cacti" binary <./cacti -infile cache.cfg>
CACTI also provides a command line interface similar to earlier versions. The command line interface can be used as
./cacti cache_size line_size associativity rw_ports excl_read_ports excl_write_ports
single_ended_read_ports search_ports banks tech_node output_width specific_tag tag_width
access_mode cache main_mem obj_func_delay obj_func_dynamic_power obj_func_leakage_power
obj_func_cycle_time obj_func_area dev_func_delay dev_func_dynamic_power dev_func_leakage_power
dev_func_area dev_func_cycle_time ed_ed2_none temp wt data_arr_ram_cell_tech_flavor_in
data_arr_peri_global_tech_flavor_in tag_arr_ram_cell_tech_flavor_in tag_arr_peri_global_tech_flavor_in
interconnect_projection_type_in wire_inside_mat_type_in wire_outside_mat_type_in
REPEATERS_IN_HTREE_SEGMENTS_in VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in
BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in PAGE_SIZE_BITS_in BURST_LENGTH_in
INTERNAL_PREFETCH_WIDTH_in force_wiretype wiretype force_config ndwl ndbl nspd ndcm
ndsam1 ndsam2 ecc
For complete documentation of the tool, please refer
to the following publications and reports.
CACTI-5.3 & 6 reports - Details on Memory/cache organizations and tradeoffs.
Latency/Energy tradeoffs for large caches and NUCA design:
"Optimizing NUCA Organizations and Wiring Alternatives for Large Caches With CACTI 6.0", that appears in MICRO 2007.
Memory IO design: CACTI-IO: CACTI With OFF-chip Power-Area-Timing Models,
MemCAD: An Interconnect Exploratory Tool for Innovative Memories Beyond DDR4
CACTI-IO Technical Report - http://www.hpl.hp.com/techreports/2013/HPL-2013-79.pdf
3D model:
CACTI-3DD: Architecture-level modeling for 3D die-stacked DRAM main memory
We are still improving the tool and refining the code. If you
have any comments, questions, or suggestions please write to
us.
Naveen Muralimanohar
naveen.muralimanohar@hpe.com
Ali Shafiee
shafiee@cs.utah.edu
Vaishnav Srinivas
vaishnav.srinivas@gmail.com

242
T1/TP/TP1/cacti_7/TSV.cc Normal file
View file

@ -0,0 +1,242 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#include "TSV.h"
/**
 * Build the driver model for one through-silicon via (TSV).
 *
 * Picks the TSV parasitic capacitance, parasitic resistance and minimum
 * footprint for the requested flavour from the global technology parameters
 * (g_tp), seeds the first buffer stage, and then runs the three modelling
 * steps in order: buffer-chain sizing, area/leakage, delay/dynamic energy.
 *
 * @param tsv_type  TSV flavour; Fine and Coarse are the values handled here.
 * @param dt        Device type used for the TSV driver transistors (the
 *                  original note says this is meant to be peri_global).
 *                  The pointer is stored, not copied.
 */
TSV::TSV(enum TSV_type tsv_type,
         /*TechnologyParameter::*/DeviceType *dt)://TSV driver's device type set to be peri_global
  deviceType(dt), tsv_type(tsv_type)
{
  num_gates     = 1;
  num_gates_min = 1;//Is there a minimum number of stages?
  min_w_pmos    = deviceType -> n_to_p_eff_curr_drv_ratio * g_tp.min_w_nmos_;

  // Give the parasitics a defined value before the switch. Previously an
  // unrecognised tsv_type fell through `default` and left cap/res/min_area
  // unassigned, although compute_buffer_stage(), compute_area() and
  // compute_delay() below read all three.
  cap      = 0;
  res      = 0;
  min_area = 0;

  switch (tsv_type)
  {
    case Fine:
      cap      = g_tp.tsv_parasitic_capacitance_fine;
      res      = g_tp.tsv_parasitic_resistance_fine;
      min_area = g_tp.tsv_minimum_area_fine;
      break;
    case Coarse:
      cap      = g_tp.tsv_parasitic_capacitance_coarse;
      res      = g_tp.tsv_parasitic_resistance_coarse;
      min_area = g_tp.tsv_minimum_area_coarse;
      break;
    default:
      // Unknown flavour: keep the zeroed parasitics set above.
      break;
  }

  // Clear every stage width; logical_effort() later fills in the stages it uses.
  for (int i = 0; i < MAX_NUMBER_GATES_STAGE; i++)
  {
    w_TSV_n[i] = 0;
    w_TSV_p[i] = 0;
  }

  // The first buffer stage is a scaled-up minimum inverter.
  double first_buf_stg_coef = 5; // To tune the total buffer delay.
  w_TSV_n[0] = g_tp.min_w_nmos_*first_buf_stg_coef;
  w_TSV_p[0] = min_w_pmos *first_buf_stg_coef;

  // Both flags are forwarded unchanged to the transistor/gate helpers.
  is_dram  = 0;
  is_wl_tr = 0;

  // Order matters: sizing sets num_gates and the stage widths that the area
  // and delay steps consume.
  compute_buffer_stage();
  compute_area();
  compute_delay();
}
// Destructor: intentionally empty. The constructor in this file only stores
// the DeviceType pointer it is given and allocates nothing, so there is
// nothing to release here.
TSV::~TSV()
{
}
/**
 * Size the inverter chain that drives the TSV.
 *
 * Sets C_load_TSV to the TSV parasitic capacitance plus the gate capacitance
 * of a minimum-size inverter, derives the overall electrical fan-out F seen
 * by the first buffer stage, and hands both to logical_effort(), which
 * returns the number of stages (stored in num_gates) and is given the
 * w_TSV_n / w_TSV_p width arrays.
 */
void TSV::compute_buffer_stage()
{
  const double pmos_to_nmos_ratio = deviceType->n_to_p_eff_curr_drv_ratio;

  // BEOL parasitics (Katti's TSV electrical model) are not modelled yet; the
  // placeholder values tried so far were judged inaccurate:
  //   double res_beol = 0.1;
  //   double cap_beol = 1e-15;
  //   C_load_TSV = cap_beol + cap + cap_beol + gate_C(g_tp.min_w_nmos_ + min_w_pmos, 0);
  C_load_TSV = cap + gate_C(g_tp.min_w_nmos_ + min_w_pmos, 0); //+ 57.5e-15;

  // Input capacitance of the first (pre-sized) buffer stage.
  const double first_stage_cin = gate_C(w_TSV_n[0] + w_TSV_p[0], 0);

  // Total fan-out the chain has to provide.
  const double F = C_load_TSV / first_stage_cin;

  if (g_ip->print_detail_debug)
  {
    cout << " The input cap of 1st buffer: " << first_stage_cin * 1e15 << " fF";
    cout << "\nF is " << F << " \n";
  }

  // Open questions carried over from the original author: must the stage
  // count be even, and should a dedicated max width be used here (the decoder
  // uses max_w_nmos_dec) instead of g_tp.max_w_nmos_?
  num_gates = logical_effort(num_gates_min,
                             1,
                             F,
                             w_TSV_n,
                             w_TSV_p,
                             C_load_TSV,
                             pmos_to_nmos_ratio,
                             is_dram,
                             is_wl_tr,
                             g_tp.max_w_nmos_);
}
void TSV::compute_area()
{
//Obtain the driver chain area and leakage power for TSV
double Vdd = deviceType->Vdd;
double cumulative_area = 0;
double cumulative_curr = 0; // cumulative leakage current
double cumulative_curr_Ig = 0; // cumulative leakage current
Buffer_area.h = g_tp.cell_h_def;//cell_h_def is the assigned height for memory cell (5um), is it correct to use it here?
//logic_effort() didn't give the size of w_n[0] and w_p[0], which is min size inverter
//w_TSV_n[0] = g_tp.min_w_nmos_;
//w_TSV_p[0] = min_w_pmos;
int i;
for (i = 0; i < num_gates; i++)
{
cumulative_area += compute_gate_area(INV, 1, w_TSV_p[i], w_TSV_n[i], Buffer_area.h);
if(g_ip->print_detail_debug)
{
cout << "\n\tArea up to the " << i+1 << " stages is: " << cumulative_area << " um2";
}
cumulative_curr += cmos_Isub_leakage(w_TSV_n[i], w_TSV_p[i], 1, inv, is_dram);
cumulative_curr_Ig += cmos_Ig_leakage(w_TSV_n[i], w_TSV_p[i], 1, inv, is_dram);// The operator += is mistakenly put as = in decoder.cc
}
power.readOp.leakage = cumulative_curr * Vdd;
power.readOp.gate_leakage = cumulative_curr_Ig * Vdd;
Buffer_area.set_area(cumulative_area);
Buffer_area.w = (cumulative_area / Buffer_area.h);
TSV_metal_area.set_area(min_area * 3.1416/16);
if( Buffer_area.get_area() < min_area - TSV_metal_area.get_area() )
area.set_area(min_area);
else
area.set_area(Buffer_area.get_area() + TSV_metal_area.get_area());
}
void TSV::compute_delay()
{
//Buffer chain delay and Dynamic Power
double rd, tf, this_delay, c_load, c_intrinsic, inrisetime = 0/*The initial time*/;
//is_dram, is_wl_tr are declared to be false in the constructor
rd = tr_R_on(w_TSV_n[0], NCH, 1, is_dram, false, is_wl_tr);
c_load = gate_C(w_TSV_n[1] + w_TSV_p[1], 0.0, is_dram, false, is_wl_tr);
c_intrinsic = drain_C_(w_TSV_p[0], PCH, 1, 1, area.h, is_dram, false, is_wl_tr) +
drain_C_(w_TSV_n[0], NCH, 1, 1, area.h, is_dram, false, is_wl_tr);
tf = rd * (c_intrinsic + c_load);
//Refer to horowitz function definition
this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE);
delay += this_delay;
inrisetime = this_delay / (1.0 - 0.5);
double Vdd = deviceType -> Vdd;
power.readOp.dynamic += (c_load + c_intrinsic) * Vdd * Vdd;
int i;
for (i = 1; i < num_gates - 1; ++i)
{
rd = tr_R_on(w_TSV_n[i], NCH, 1, is_dram, false, is_wl_tr);
c_load = gate_C(w_TSV_p[i+1] + w_TSV_n[i+1], 0.0, is_dram, false, is_wl_tr);
c_intrinsic = drain_C_(w_TSV_p[i], PCH, 1, 1, area.h, is_dram, false, is_wl_tr) +
drain_C_(w_TSV_n[i], NCH, 1, 1, area.h, is_dram, false, is_wl_tr);
tf = rd * (c_intrinsic + c_load);
this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE);
delay += this_delay;
inrisetime = this_delay / (1.0 - 0.5);
power.readOp.dynamic += (c_load + c_intrinsic) * Vdd * Vdd;
}
// add delay of final inverter that drives the TSV
i = num_gates - 1;
c_load = C_load_TSV;
rd = tr_R_on(w_TSV_n[i], NCH, 1, is_dram, false, is_wl_tr);
c_intrinsic = drain_C_(w_TSV_p[i], PCH, 1, 1, area.h, is_dram, false, is_wl_tr) +
drain_C_(w_TSV_n[i], NCH, 1, 1, area.h, is_dram, false, is_wl_tr);
//The delay method for the last stage of buffer chain in Decoder.cc
//double res_beol = 0.1;//inaccurate
//double R_TSV_out = res_beol + res + res_beol;
double R_TSV_out = res;
tf = rd * (c_intrinsic + c_load) + R_TSV_out * c_load / 2;
this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE);
delay += this_delay;
power.readOp.dynamic += (c_load + c_intrinsic) * Vdd * Vdd; //Dynamic power done
//Is the delay actually delay/(1.0-0.5)??
//ret_val = this_delay / (1.0 - 0.5);
//return ret_val;//Originally for decoder.cc to get outrise time
/* This part is to obtain delay in the TSV path, refer to Katti's paper.
* It can be used alternatively as the step to get the final-stage delay
double C_ext = c_intrinsic;
R_dr = rd;
double C_int = gate_C(g_tp.min_w_nmos_ + min_w_pmos, 0.0, is_dram, false, is_wl_tr);
delay_TSV_path = 0.693 * (R_dr * C_ext + (R_dr + res_beol) * cap_beol + (R_dr + res_beol + 0.5 * res) * cap
+ (R_dr + res_beol + res + res_beol) * (cap_beol + C_int);
delay += delay_TSV_path;
*/
}
/*
 * Prints a human-readable report of the TSV model on standard output:
 * electrical parameters, buffer-chain depth, delay, dynamic energy, leakage
 * and the area breakdown. Values are scaled for display (fF, mOhm, ns, nJ, mW).
 */
void TSV::print_TSV()
{
  cout << "\nTSV Properties:\n\n";
  cout << " Delay Optimal - ";

  // Electrical characteristics and timing
  cout << " \n\tTSV Cap: " << cap * 1e15 << " fF";
  cout << " \n\tTSV Res: " << res * 1e3 << " mOhm";
  cout << " \n\tNumber of Buffer Chain stages - " << num_gates;
  cout << " \n\tDelay - " << delay * 1e9 << " (ns) ";

  // Energy and leakage
  cout << " \n\tPowerD - " << power.readOp.dynamic * 1e9 << " (nJ)";
  cout << " \n\tPowerL - " << power.readOp.leakage * 1e3 << " (mW)";
  cout << " \n\tPowerLgate - " << power.readOp.gate_leakage * 1e3 << " (mW)\n";

  // Area breakdown
  cout << " \n\tBuffer Area: " << Buffer_area.get_area() << " um2";
  cout << " \n\tBuffer Height: " << Buffer_area.h << " um";
  cout << " \n\tBuffer Width: " << Buffer_area.w << " um";
  cout << " \n\tTSV metal area: " << TSV_metal_area.get_area() << " um2";
  cout << " \n\tTSV minimum occupied area: " << min_area << " um2";
  cout << " \n\tTotal area: " << area.get_area() << " um2";

  cout << endl << endl;
}

96
T1/TP/TP1/cacti_7/TSV.h Normal file
View file

@ -0,0 +1,96 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#ifndef TSV_H_
#define TSV_H_
#include "basic_circuit.h"
#include "component.h"
#include "parameter.h"
//#include "assert.h"
#include "cacti_interface.h"
#include "const.h"
//#include "area.h"
#include <cmath>
#include <iostream>
#include <list>
/*
 * Model of a through-silicon via (TSV) and of the inverter (buffer) chain
 * that drives it. Delay, area and power results are stored in the members
 * inherited from Component (delay, area, power, ...).
 */
class TSV : public Component
{
  public:
    /*
     * tsv_type : which kind of TSV is modelled (see enum TSV_type).
     * dt       : device type used for the driver transistors; defaults to
     *            the global peripheral device type.
     */
    TSV(enum TSV_type tsv_type,
        /*TechnologyParameter::*/DeviceType * dt = &(g_tp.peri_global));//Should change peri_global to TSV in technology.cc
    //TSV():len(20),rad(2.5),pitch(50){}
    ~TSV();

    double res;//TSV resistance
    double cap;//TSV capacitance
    double C_load_TSV;//The intrinsic load plus the load TSV is driving, needs changes?
    double min_area;
    //int num_IO;//number of I/O
    int num_gates;     // number of inverter stages in the buffer chain
    int num_gates_min;//Necessary?

    // NMOS / PMOS widths of each buffer stage, indexed by stage number.
    double w_TSV_n[MAX_NUMBER_GATES_STAGE];
    double w_TSV_p[MAX_NUMBER_GATES_STAGE];
    //double delay_TSV_path;//Delay of TSV path including the parasitics

    // Two boolean flags forwarded to the transistor helpers (tr_R_on, gate_C,
    // drain_C_); defaulted to false in the constructor. They were previously
    // declared as double, which only worked through implicit conversion.
    bool is_dram;
    bool is_wl_tr;

    void compute_buffer_stage();
    void compute_area();
    void compute_delay();
    void print_TSV();

    Area TSV_metal_area; // area of the TSV metal itself
    Area Buffer_area;    // area of the driver chain
    /*//Inherited from Component
    double delay;
    Area area;
    powerDef power, rt_power;
    double delay;
    double cycle_time;
    int logical_effort();*/

  private:
    double min_w_pmos;
    /*TechnologyParameter::*/DeviceType * deviceType;
    unsigned int tsv_type;
};
#endif /* TSV_H_ */

1073
T1/TP/TP1/cacti_7/Ucache.cc Normal file

File diff suppressed because it is too large Load diff

118
T1/TP/TP1/cacti_7/Ucache.h Normal file
View file

@ -0,0 +1,118 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#ifndef __UCACHE_H__
#define __UCACHE_H__
#include <list>
#include "area.h"
#include "router.h"
#include "nuca.h"
// Holds one "best so far" value for each metric considered while comparing
// candidate organizations: delay, dynamic energy, leakage, area and cycle time.
class min_values_t
{
public:
double min_delay;
double min_dyn;
double min_leakage;
double min_area;
double min_cyc;
// Every metric starts at the BIGNUM sentinel.
min_values_t() : min_delay(BIGNUM), min_dyn(BIGNUM), min_leakage(BIGNUM), min_area(BIGNUM), min_cyc(BIGNUM) { }
// One overload per result type; the definitions are not in this header.
// NOTE(review): presumably each overload lowers the fields above to the
// smaller of the stored and the supplied value -- confirm in Ucache.cc.
void update_min_values(const min_values_t * val);
void update_min_values(const uca_org_t & res);
void update_min_values(const nuca_org_t * res);
void update_min_values(const mem_array * res);
};
// One candidate pairing of a tag array with a data array, together with the
// figures of merit recorded for that pairing.
// NOTE(review): units of the double fields are not visible here -- confirm against the code that fills them.
struct solution
{
// Positions of the chosen tag / data arrays, kept both as indices and as list iterators.
int tag_array_index;
int data_array_index;
list<mem_array *>::iterator tag_array_iter;
list<mem_array *>::iterator data_array_iter;
// Figures of merit of this pairing.
double access_time;
double cycle_time;
double area;
double efficiency;
powerDef total_power;
};
// Evaluates one array organization; defined outside this header.
// NOTE(review): Nspd/Ndwl/Ndbl/Ndcm/Ndsam_lev_* look like the partitioning
// parameters of the organization and the bool result like a "valid
// organization" flag -- confirm against the definition in Ucache.cc.
// Note that pure_ram is an int here while calc_time_mt_wrapper_struct stores it as bool.
bool calculate_time(
bool is_tag,
int pure_ram,
bool pure_cam,
double Nspd,
unsigned int Ndwl,
unsigned int Ndbl,
unsigned int Ndcm,
unsigned int Ndsam_lev_1,
unsigned int Ndsam_lev_2,
mem_array *ptr_array,
int flag_results_populate,
results_mem_array *ptr_results,
uca_org_t *ptr_fin_res,
Wire_type wtype, // merge from cacti-7 to cacti3d
bool is_main_mem);
// Entry points operating on the final result structure; defined outside this header.
void update(uca_org_t *fin_res);
void solve(uca_org_t *fin_res);
// Sets up the technology parameters for feature size `tech`, for a tag or a data array.
// NOTE(review): unit of `tech` is not visible here -- confirm against the definition.
void init_tech_params(double tech, bool is_tag);
// Argument bundle handed (as a void*) to calc_time_mt_wrapper.
// NOTE(review): the "mt" in the name and the tid field suggest one instance per worker thread -- confirm in Ucache.cc.
struct calc_time_mt_wrapper_struct
{
uint32_t tid;
// Kind of array being explored.
bool is_tag;
bool pure_ram;
bool pure_cam;
bool is_main_mem;
double Nspd_min;
// Minimum-value trackers for the data and tag arrays (not owned by value here: raw pointers).
min_values_t * data_res;
min_values_t * tag_res;
// Lists of data / tag array candidates (pointers to mem_array).
list<mem_array *> data_arr;
list<mem_array *> tag_arr;
};
// Type-erased entry point (void* in, void* out); defined outside this header.
// NOTE(review): void_obj is presumably a calc_time_mt_wrapper_struct* -- confirm in Ucache.cc.
void *calc_time_mt_wrapper(void * void_obj);
// Prints the global technology parameters (g_tp); defined outside this header.
void print_g_tp();
#endif

53
T1/TP/TP1/cacti_7/_script.py Executable file
View file

@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
"""
TP1 T1
This script is used to test different cache configurations with cacti.

It rewrites the four port settings that follow the "<configs_ports>" marker
in the cacti configuration file, runs cacti on that file and prints every
access time reported in the output.
"""
import os
import sys

chemin_fichier_config = "_fichiers/configs/"
chemin_fichier_result = "_fichiers/resultats/"
# Each entry is [read-write, exclusive read, exclusive write, single-ended read] port counts.
configurations = [[1,0,0,0],[1,0,1,1],[1,0,0,1],[1,1,1,1]]
nom_fichier_config = "cache.cfg"
# Index of the entry of `configurations` that is written to the config file.
i = 0

#----------- This section sets the number of input/output ports in the config file ---------
with open(chemin_fichier_config + nom_fichier_config, "r") as fichier:
    Lignes = fichier.readlines()

lignes_ports = [
    "-read-write port {}\n".format(configurations[i][0]),
    "-exclusive read port {}\n".format(configurations[i][1]),
    "-exclusive write port {}\n".format(configurations[i][2]),
    "-single ended read ports {}\n".format(configurations[i][3]),
]

for index, ligne in enumerate(Lignes):
    if "<configs_ports>" in ligne:
        # Keep the marker on its own line even if it is the unterminated last line.
        if not ligne.endswith("\n"):
            Lignes[index] = ligne + "\n"
        # Slice assignment overwrites the four lines after the marker and
        # simply extends the list when fewer than four lines follow it
        # (the indexed assignments used before raised IndexError there).
        Lignes[index + 1:index + 1 + len(lignes_ports)] = lignes_ports
        break
else:
    # Nothing was replaced: make it visible instead of silently simulating
    # whatever port settings the file already contains.
    sys.stderr.write("warning: '<configs_ports>' marker not found in {}\n".format(
        chemin_fichier_config + nom_fichier_config))

with open(chemin_fichier_config + nom_fichier_config, "w") as fichier:
    fichier.writelines(Lignes)

#----------- Section that launches the simulation with cacti -------------------------------
commande = "./cacti -infile {}".format(chemin_fichier_config + nom_fichier_config)
# The pipe is closed as soon as the output has been read.
with os.popen(commande) as flux:
    resultat = flux.read().split('\n')

for ligne in resultat:
    if "Access time (ns):" in ligne:
        val_tps = float(ligne.split(':')[1])
        print(val_tps)

# Debug helper: dump the raw cacti output.
#flux = os.popen(commande)
#print(flux.read())

View file

@ -0,0 +1,130 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#include "arbiter.h"
/*
 * Builds an arbiter model.
 *   n_req      : stored in R (number of requesters)
 *   flit_size_ : stored in flit_size
 *   output_len : stored in o_len
 *   dt         : device type providing Vdd and the N-to-P drive ratio
 *
 * All transistor size members are derived from the feature size
 * g_ip->F_sz_um.
 */
Arbiter::Arbiter(
    double n_req,
    double flit_size_,
    double output_len,
    /*TechnologyParameter::*/DeviceType *dt)
  : R(n_req),
    flit_size(flit_size_),
    o_len(output_len),
    deviceType(dt)
{
  const double feature_um = g_ip->F_sz_um;

  Vdd = deviceType->Vdd;
  min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio*g_tp.min_w_nmos_;

  // First and second NOR gates (same sizing for both)
  NTn1 = 13.5*feature_um/2;
  PTn1 = 76*feature_um/2;
  NTn2 = 13.5*feature_um/2;
  PTn2 = 76*feature_um/2;

  // Inverter
  NTi = 12.5*feature_um/2;
  PTi = 25*feature_um/2;

  // Transmission gate
  NTtr = 10*feature_um/2; /* Transmission gate's nmos tr. length */
  PTtr = 20*feature_um/2; /* pmos tr. length */
}
// Nothing to release: the destructor body is empty (deviceType is not deleted here).
Arbiter::~Arbiter(){}
// Capacitance seen on a request line: gate loads of the first NOR gates of
// the (R-1) other requesters, of the second NOR gate and of the inverter,
// plus the inverter's drain capacitances.
double
Arbiter::arb_req() {
  double cap = (R-1)*(2*gate_C(NTn1, 0)+gate_C(PTn1, 0));
  cap = cap + 2*gate_C(NTn2, 0);
  cap = cap + gate_C(PTn2, 0);
  cap = cap + gate_C(NTi, 0);
  cap = cap + gate_C(PTi, 0);
  cap = cap + drain_C_(NTi, 0, 1, 1, g_tp.cell_h_def);
  cap = cap + drain_C_(PTi, 1, 1, 1, g_tp.cell_h_def);
  return cap;
}
// Capacitance of a priority line: twice the gate load of one first-stage
// NOR gate. The switching capacitance of the flip-flop is ignored.
double
Arbiter::arb_pri() {
  const double nor1_gate_cap = 2*gate_C(NTn1, 0)+gate_C(PTn1, 0);
  return 2*nor1_gate_cap;
}
// Capacitance of a grant line: drain capacitances of the first NOR gate plus
// the crossbar control line it drives.
double
Arbiter::arb_grant() {
  const double nmos_drains = drain_C_(NTn1, 0, 1, 1, g_tp.cell_h_def)*2;
  const double pmos_drain = drain_C_(PTn1, 1, 1, 1, g_tp.cell_h_def);
  return nmos_drains + pmos_drain + crossbar_ctrline();
}
// Capacitance of the internal node between the two NOR gates: drains of the
// first NOR gate plus gate load of the second one.
double
Arbiter::arb_int() {
  double cap = drain_C_(NTn1, 0, 1, 1, g_tp.cell_h_def)*2;
  cap = cap + drain_C_(PTn1, 1, 1, 1, g_tp.cell_h_def);
  cap = cap + 2*gate_C(NTn2, 0);
  cap = cap + gate_C(PTn2, 0);
  return cap;
}
// Fills power.readOp (dynamic, leakage, gate_leakage) from the line
// capacitances above and from the leakage currents of the two NOR gates and
// the inverter.
void
Arbiter::compute_power() {
  // Switching energy of each group of lines
  const double req_energy = R*arb_req()*Vdd*Vdd/2;
  const double pri_energy = R*arb_pri()*Vdd*Vdd/2;
  const double grant_energy = arb_grant()*Vdd*Vdd;
  const double int_energy = arb_int()*0.5*Vdd*Vdd;
  power.readOp.dynamic = (req_energy + pri_energy +
                          grant_energy + int_energy);

  // Subthreshold leakage currents
  const double isub_nor1 = cmos_Isub_leakage(g_tp.min_w_nmos_*NTn1*2, min_w_pmos * PTn1*2, 2, nor);
  const double isub_nor2 = cmos_Isub_leakage(g_tp.min_w_nmos_*NTn2*R, min_w_pmos * PTn2*R, 2, nor);
  const double isub_inv = cmos_Isub_leakage(g_tp.min_w_nmos_*NTi, min_w_pmos * PTi, 1, inv);
  power.readOp.leakage = (isub_nor1 + isub_nor2 + isub_inv)*Vdd; //FIXME include priority table leakage

  // Gate leakage currents
  const double ig_nor1 = cmos_Ig_leakage(g_tp.min_w_nmos_*NTn1*2, min_w_pmos * PTn1*2, 2, nor);
  const double ig_nor2 = cmos_Ig_leakage(g_tp.min_w_nmos_*NTn2*R, min_w_pmos * PTn2*R, 2, nor);
  const double ig_inv = cmos_Ig_leakage(g_tp.min_w_nmos_*NTi, min_w_pmos * PTi, 1, inv);
  power.readOp.gate_leakage = ig_nor1*Vdd + ig_nor2*Vdd + ig_inv*Vdd;
}
double //wire cap with triple spacing
Arbiter::Cw3(double length) {
Wire wc(g_ip->wt, length, 1, 3, 3);
double temp = (wc.wire_cap(length,true));
return temp;
}
// Capacitance of the crossbar control line: wire of length o_len (scaled by
// 1e-6 to metres) plus the drain and gate capacitances of the inverter.
double
Arbiter::crossbar_ctrline() {
  double cap = Cw3(o_len * 1e-6 /* m */);
  cap = cap + drain_C_(NTi, 0, 1, 1, g_tp.cell_h_def);
  cap = cap + drain_C_(PTi, 1, 1, 1, g_tp.cell_h_def);
  cap = cap + gate_C(NTi, 0);
  cap = cap + gate_C(PTi, 0);
  return cap;
}
// Control capacitance of the transmission buffer: gate capacitance of its
// NMOS plus that of its PMOS.
double
Arbiter::transmission_buf_ctrcap() {
  const double nmos_gate = gate_C(NTtr, 0);
  const double pmos_gate = gate_C(PTtr, 0);
  return nmos_gate+pmos_gate;
}
// Prints the arbiter summary on standard output: number of inputs, flit
// size, dynamic figure (scaled by 1e9) and leakage (scaled by 1e3).
// Gate leakage is not part of this report.
void Arbiter::print_arbiter()
{
  cout << "\nArbiter Stats (" << R << " input arbiter" << ")\n\n";

  cout << "Flit size : " << flit_size << " bits" << endl;

  const double dynamic_scaled = power.readOp.dynamic*1e9;
  cout << "Dynamic Power : " << dynamic_scaled << " (nJ)" << endl;

  const double leakage_scaled = power.readOp.leakage*1e3;
  cout << "Leakage Power : " << leakage_scaled << " (mW)" << endl;
}

View file

@ -0,0 +1,77 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#ifndef __ARBITER__
#define __ARBITER__
#include <assert.h>
#include <iostream>
#include "basic_circuit.h"
#include "cacti_interface.h"
#include "component.h"
#include "parameter.h"
#include "mat.h"
#include "wire.h"
// Power model of an arbiter built from two NOR gates and an inverter per
// request (see compute_power), driving a crossbar control line.
class Arbiter : public Component
{
public:
// Req        -> number of requesters (stored in R)
// flit_sz    -> flit size, only reported by print_arbiter()
// output_len -> length used for the crossbar control wire (scaled by 1e-6 before use)
// dt         -> device type supplying Vdd and the N-to-P drive ratio
Arbiter(
double Req,
double flit_sz,
double output_len,
/*TechnologyParameter::*/DeviceType *dt = &(g_tp.peri_global));
~Arbiter();
// Prints number of inputs, flit size, dynamic and leakage figures.
void print_arbiter();
// Capacitance sums of the request, priority, grant and internal lines.
double arb_req();
double arb_pri();
double arb_grant();
double arb_int();
// Fills power.readOp.dynamic / leakage / gate_leakage.
void compute_power();
// Wire capacitance with triple spacing for a wire of length len.
double Cw3(double len);
// Capacitance of the crossbar control line (wire + inverter).
double crossbar_ctrline();
// Gate capacitance of the transmission-gate NMOS + PMOS.
double transmission_buf_ctrcap();
private:
// Transistor sizes: NTn1/PTn1 first NOR, NTn2/PTn2 second NOR, NTi/PTi inverter; R is the requester count.
double NTn1, PTn1, NTn2, PTn2, R, PTi, NTi;
double flit_size;
// Transmission-gate NMOS / PMOS sizes.
double NTtr, PTtr;
double o_len;
/*TechnologyParameter::*/DeviceType *deviceType;
// NOTE(review): TriS1/TriS2 are not referenced by the arbiter.cc code shown with this header and are never initialized there.
double TriS1, TriS2;
double min_w_pmos, Vdd;
};
#endif

46
T1/TP/TP1/cacti_7/area.cc Normal file
View file

@ -0,0 +1,46 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#include "area.h"
#include "component.h"
#include "decoder.h"
#include "parameter.h"
#include "basic_circuit.h"
#include <iostream>
#include <math.h>
#include <assert.h>
using namespace std;

71
T1/TP/TP1/cacti_7/area.h Normal file
View file

@ -0,0 +1,71 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#ifndef __AREA_H__
#define __AREA_H__
#include "cacti_interface.h"
#include "basic_circuit.h"
using namespace std;
// Rectangle / area holder. The area can be given either through its width
// and height or, when both are left at zero, through an explicit value set
// with set_area().
class Area
{
  public:
    double w;
    double h;

    // Starts with zero width, zero height and zero explicit area.
    Area()
      : w(0),
        h(0),
        area(0)
    {
    }

    double get_w() const { return w; }
    double get_h() const { return h; }

    // Returns the explicitly stored area only while both dimensions are
    // zero; as soon as either dimension is non-zero the product w*h wins.
    double get_area() const
    {
      const bool dims_unset = (w == 0 && h == 0);
      return dims_unset ? area : w*h;
    }

    void set_w(double w_) { w = w_; }
    void set_h(double h_) { h = h_; }
    // Stores an explicit area; it does not touch w or h.
    void set_area(double a_) { area = a_; }

  private:
    double area;
};
#endif

206
T1/TP/TP1/cacti_7/bank.cc Normal file
View file

@ -0,0 +1,206 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#include "bank.h"
#include <iostream>
/*
 * Builds a bank: one Mat model plus the H-trees that route address, data-in
 * and data-out (and, for fully-associative / CAM banks, search-in and
 * search-out) signals to the mats.
 *
 * dyn_p is kept by reference in `dp`, so it must outlive the Bank.
 *
 * The bank dimensions (area.w / area.h) are taken from the data-in H-tree.
 */
Bank::Bank(const DynamicParameter & dyn_p):
  dp(dyn_p), mat(dp),
  num_addr_b_mat(dyn_p.number_addr_bits_mat),
  num_mats_hor_dir(dyn_p.num_mats_h_dir), num_mats_ver_dir(dyn_p.num_mats_v_dir),
  array_leakage(0),
  wl_leakage(0),
  cl_leakage(0)
{
  // Give every H-tree pointer a defined value up front: the search trees are
  // only allocated for fully-associative / CAM banks and were previously
  // left uninitialized otherwise.
  htree_in_add = 0;
  htree_in_data = 0;
  htree_out_data = 0;
  htree_in_search = 0;
  htree_out_search = 0;

  // Port counts come either from the dynamic parameters or from the global input.
  int RWP;
  int ERP;
  int EWP;
  int SCHP;
  if (dp.use_inp_params)
  {
    RWP = dp.num_rw_ports;
    ERP = dp.num_rd_ports;
    EWP = dp.num_wr_ports;
    SCHP = dp.num_search_ports;
  }
  else
  {
    RWP = g_ip->num_rw_ports;
    ERP = g_ip->num_rd_ports;
    EWP = g_ip->num_wr_ports;
    SCHP = g_ip->num_search_ports;
  }

  const bool is_fa_or_cam = dp.fully_assoc || dp.pure_cam;

  int total_addrbits = (dp.number_addr_bits_mat + dp.number_subbanks_decode)*(RWP+ERP+EWP);
  int datainbits = dp.num_di_b_bank_per_port * (RWP + EWP);
  int dataoutbits = dp.num_do_b_bank_per_port * (RWP + ERP);
  // Search widths are only meaningful for fully-associative / CAM banks.
  int searchinbits = 0;
  int searchoutbits = 0;

  if (!is_fa_or_cam)
  {
    if (g_ip->fast_access && dp.is_tag == false)
    {
      dataoutbits *= g_ip->data_assoc;
    }
    htree_in_add = new Htree2 (dp.wtype/*g_ip->wt*/,(double) mat.area.w, (double)mat.area.h,
        total_addrbits, datainbits, 0,dataoutbits,0, num_mats_ver_dir*2, num_mats_hor_dir*2, Add_htree);
    htree_in_data = new Htree2 (dp.wtype/*g_ip->wt*/,(double) mat.area.w, (double)mat.area.h,
        total_addrbits, datainbits, 0,dataoutbits,0, num_mats_ver_dir*2, num_mats_hor_dir*2, Data_in_htree);
    htree_out_data = new Htree2 (dp.wtype/*g_ip->wt*/,(double) mat.area.w, (double)mat.area.h,
        total_addrbits, datainbits, 0,dataoutbits,0, num_mats_ver_dir*2, num_mats_hor_dir*2, Data_out_htree);
  }
  else
  {
    searchinbits = dp.num_si_b_bank_per_port * SCHP;
    searchoutbits = dp.num_so_b_bank_per_port * SCHP;

    htree_in_add = new Htree2 (dp.wtype/*g_ip->wt*/,(double) mat.area.w, (double)mat.area.h,
        total_addrbits, datainbits, searchinbits,dataoutbits,searchoutbits, num_mats_ver_dir*2, num_mats_hor_dir*2, Add_htree);
    htree_in_data = new Htree2 (dp.wtype/*g_ip->wt*/,(double) mat.area.w, (double)mat.area.h,
        total_addrbits, datainbits,searchinbits, dataoutbits, searchoutbits, num_mats_ver_dir*2, num_mats_hor_dir*2, Data_in_htree);
    htree_out_data = new Htree2 (dp.wtype/*g_ip->wt*/,(double) mat.area.w, (double)mat.area.h,
        total_addrbits, datainbits,searchinbits, dataoutbits, searchoutbits,num_mats_ver_dir*2, num_mats_hor_dir*2, Data_out_htree);
    htree_in_search = new Htree2 (dp.wtype/*g_ip->wt*/,(double) mat.area.w, (double)mat.area.h,
        total_addrbits, datainbits,searchinbits, dataoutbits, searchoutbits, num_mats_ver_dir*2, num_mats_hor_dir*2, Data_in_htree,true, true);
    htree_out_search = new Htree2 (dp.wtype/*g_ip->wt*/,(double) mat.area.w, (double)mat.area.h,
        total_addrbits, datainbits,searchinbits, dataoutbits, searchoutbits,num_mats_ver_dir*2, num_mats_hor_dir*2, Data_out_htree,true);
  }

  // Both kinds of bank take their dimensions from the data-in H-tree.
  area.w = htree_in_data->area.w;
  area.h = htree_in_data->area.h;

  // Address bits are split between row decoding (needed for activation) and
  // the remaining bits (needed for the read or write itself).
  num_addr_b_row_dec = _log2(mat.subarray.num_rows);
  num_addr_b_routed_to_mat_for_act = num_addr_b_row_dec;
  num_addr_b_routed_to_mat_for_rd_or_wr = num_addr_b_mat - num_addr_b_row_dec;
}
// Releases the H-trees allocated by the constructor. The two search H-trees
// are only allocated for fully-associative / CAM banks, so they are only
// released in that case.
Bank::~Bank()
{
  delete htree_in_add;
  delete htree_out_data;
  delete htree_in_data;

  const bool has_search_trees = dp.fully_assoc || dp.pure_cam;
  if (!has_search_trees)
  {
    return;
  }

  delete htree_in_search;
  delete htree_out_search;
}
// Delegates the delay computation to the mat; the value returned by the mat
// (the output rise time, per the class declaration) is passed through.
double Bank::compute_delays(double inrisetime)
{
  const double outrisetime = mat.compute_delays(inrisetime);
  return outrisetime;
}
/*
 * Accumulates the bank's power figures from its mat and its H-trees.
 *
 * Common to every bank: read dynamic energy of the mat plus address and
 * data-out H-trees, and mat leakage scaled by the number of mats.
 * Regular (non fully-associative, non CAM) banks additionally accumulate the
 * array / wordline / column leakage breakdown; H-tree leakage is NOT added
 * for them. Fully-associative and CAM banks additionally accumulate search
 * energy and the leakage of all five H-trees.
 */
void Bank::compute_power_energy()
{
  mat.compute_power_energy();

  const bool is_fa_or_cam = dp.fully_assoc || dp.pure_cam;

  // Read dynamic energy: mat first, then address and data-out H-trees.
  if (is_fa_or_cam)
  {
    power.readOp.dynamic += mat.power.readOp.dynamic ;//for fa and cam num_act_mats_hor_dir is 1 for plain r/w
  }
  else
  {
    power.readOp.dynamic += mat.power.readOp.dynamic * dp.num_act_mats_hor_dir;
  }
  power.readOp.dynamic += htree_in_add->power.readOp.dynamic;
  power.readOp.dynamic += htree_out_data->power.readOp.dynamic;

  // Mat leakage, counted once per mat.
  power.readOp.leakage += mat.power.readOp.leakage * dp.num_mats;
  power.readOp.gate_leakage += mat.power.readOp.gate_leakage * dp.num_mats;

  if (!is_fa_or_cam)
  {
    array_leakage += mat.array_leakage*dp.num_mats;
    wl_leakage += mat.wl_leakage*dp.num_mats;
    cl_leakage += mat.cl_leakage*dp.num_mats;
    // H-tree leakage and gate leakage (htree_in_add, htree_in_data,
    // htree_out_data) are intentionally left out here; the corresponding
    // statements are disabled in the original code.
    return;
  }

  // Search energy: mats, then the per-mat search circuitry, then the search H-trees.
  power.searchOp.dynamic += mat.power.searchOp.dynamic * dp.num_mats;
  power.searchOp.dynamic += mat.power_bl_precharge_eq_drv.searchOp.dynamic +
    mat.power_sa.searchOp.dynamic +
    mat.power_bitline.searchOp.dynamic +
    mat.power_subarray_out_drv.searchOp.dynamic+
    mat.ml_to_ram_wl_drv->power.readOp.dynamic;
  power.searchOp.dynamic += htree_in_search->power.searchOp.dynamic;
  power.searchOp.dynamic += htree_out_search->power.searchOp.dynamic;

  // Subthreshold leakage of the five H-trees.
  power.readOp.leakage += htree_in_add->power.readOp.leakage;
  power.readOp.leakage += htree_in_data->power.readOp.leakage;
  power.readOp.leakage += htree_out_data->power.readOp.leakage;
  power.readOp.leakage += htree_in_search->power.readOp.leakage;
  power.readOp.leakage += htree_out_search->power.readOp.leakage;

  // Gate leakage of the five H-trees.
  power.readOp.gate_leakage += htree_in_add->power.readOp.gate_leakage;
  power.readOp.gate_leakage += htree_in_data->power.readOp.gate_leakage;
  power.readOp.gate_leakage += htree_out_data->power.readOp.gate_leakage;
  power.readOp.gate_leakage += htree_in_search->power.readOp.gate_leakage;
  power.readOp.gate_leakage += htree_out_search->power.readOp.gate_leakage;
}

74
T1/TP/TP1/cacti_7/bank.h Normal file
View file

@ -0,0 +1,74 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#ifndef __BANK_H__
#define __BANK_H__
#include "component.h"
#include "decoder.h"
#include "mat.h"
#include "htree2.h"
// A memory bank: one Mat model replicated over a grid of mats, plus the
// H-trees that distribute address / data / search signals to them.
class Bank : public Component
{
public:
// dyn_p is stored by reference (see dp), so it must outlive the Bank.
Bank(const DynamicParameter & dyn_p);
~Bank();
double compute_delays(double inrisetime); // return outrisetime
// Accumulates the mat and H-tree contributions into power and the leakage members below.
void compute_power_energy();
const DynamicParameter & dp;
Mat mat;
// H-trees owned by the bank (allocated with new in the constructor, deleted in the destructor).
Htree2 *htree_in_add;
Htree2 *htree_in_data;
Htree2 *htree_out_data;
// Only allocated when dp.fully_assoc or dp.pure_cam is set.
Htree2 *htree_in_search;
Htree2 *htree_out_search;
int num_addr_b_mat;
int num_mats_hor_dir;
int num_mats_ver_dir;
// log2 of the number of subarray rows.
int num_addr_b_row_dec;
// Address-bit split: row-decode bits for activation, the remainder for read/write.
int num_addr_b_routed_to_mat_for_act;
int num_addr_b_routed_to_mat_for_rd_or_wr;
// Leakage breakdown, accumulated by compute_power_energy() for banks that are neither fully associative nor CAM.
double array_leakage;
double wl_leakage;
double cl_leakage;
};
#endif

View file

@ -0,0 +1,999 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#include "basic_circuit.h"
#include "parameter.h"
#include <iostream>
#include <assert.h>
#include <cmath>
// Integer base-2 logarithm, rounded down.
//
// Counts how many times `num` can be shifted right by one bit before it
// becomes 1, which is the bit index of its most significant set bit.
// For num == 0 the logarithm is undefined: the function prints "log0?" to
// stderr and terminates the process with exit status 1.
uint32_t _log2(uint64_t num)
{
    if (num == 0)
    {
        std::cerr << "log0?" << std::endl;
        exit(1);
    }

    uint32_t shift_count = 0;
    for (uint64_t remaining = num; remaining > 1; remaining >>= 1)
    {
        ++shift_count;
    }
    return shift_count;
}
// Returns true when `val` is a positive power of two (1, 2, 4, 8, ...).
// Zero and negative values are never powers of two.
//
// A positive integer is a power of two exactly when it has a single bit set;
// clearing the lowest set bit with `val & (val - 1)` then yields zero. This
// gives the same answers as comparing floor(log2(val)) with
// floor(log2(val - 1)) but needs no loops and no helper function.
bool is_pow2(int64_t val)
{
    if (val <= 0)
    {
        return false;
    }
    return (val & (val - 1)) == 0;
}
// Integer exponentiation by repeated multiplication: returns base^n.
// For n <= 0 no multiplication takes place and the result is 1.
// The product is accumulated in a plain int, so large results overflow.
int powers (int base, int n)
{
    int product = 1;
    int remaining = n;
    while (remaining > 0)
    {
        product *= base;
        --remaining;
    }
    return product;
}
/*----------------------------------------------------------------------*/
// Base-2 logarithm of a strictly positive double, computed as ln(x) / ln(2).
// Asserts that x > 0 (the check disappears when NDEBUG is defined).
double logtwo (double x)
{
    assert(x > 0);
    const double ln_x = log (x);
    const double ln_two = log (2.0);
    return (double) (ln_x / ln_two);
}
/*----------------------------------------------------------------------*/
double gate_C(
double width,
double wirelength,
bool _is_dram,
bool _is_cell,
bool _is_wl_tr,
bool _is_sleep_tx)
{
const /*TechnologyParameter::*/DeviceType * dt;
if (_is_dram && _is_cell)
{
dt = &g_tp.dram_acc; //DRAM cell access transistor
}
else if (_is_dram && _is_wl_tr)
{
dt = &g_tp.dram_wl; //DRAM wordline transistor
}
else if (!_is_dram && _is_cell)
{
dt = &g_tp.sram_cell; // SRAM cell access transistor
}
else if (_is_sleep_tx)
{
dt = &g_tp.sleep_tx; // Sleep transistor
}
else
{
dt = &g_tp.peri_global;
}
return (dt->C_g_ideal + dt->C_overlap + 3*dt->C_fringe)*width + dt->l_phy*Cpolywire;
}
// returns gate capacitance in Farads
// actually this function is the same as gate_C() now
double gate_C_pass(
double width, // gate width in um (length is Lphy_periph_global)
double wirelength, // poly wire length going to gate in lambda
bool _is_dram,
bool _is_cell,
bool _is_wl_tr,
bool _is_sleep_tx)
{
// v5.0
const /*TechnologyParameter::*/DeviceType * dt;
if ((_is_dram) && (_is_cell))
{
dt = &g_tp.dram_acc; //DRAM cell access transistor
}
else if ((_is_dram) && (_is_wl_tr))
{
dt = &g_tp.dram_wl; //DRAM wordline transistor
}
else if ((!_is_dram) && _is_cell)
{
dt = &g_tp.sram_cell; // SRAM cell access transistor
}
else if (_is_sleep_tx)
{
dt = &g_tp.sleep_tx; // Sleep transistor
}
else
{
dt = &g_tp.peri_global;
}
return (dt->C_g_ideal + dt->C_overlap + 3*dt->C_fringe)*width + dt->l_phy*Cpolywire;
}
double drain_C_(
double width,
int nchannel,
int stack,
int next_arg_thresh_folding_width_or_height_cell,
double fold_dimension,
bool _is_dram,
bool _is_cell,
bool _is_wl_tr,
bool _is_sleep_tx)
{
double w_folded_tr;
const /*TechnologyParameter::*/DeviceType * dt;
if ((_is_dram) && (_is_cell))
{
dt = &g_tp.dram_acc; // DRAM cell access transistor
}
else if ((_is_dram) && (_is_wl_tr))
{
dt = &g_tp.dram_wl; // DRAM wordline transistor
}
else if ((!_is_dram) && _is_cell)
{
dt = &g_tp.sram_cell; // SRAM cell access transistor
}
else if (_is_sleep_tx)
{
dt = &g_tp.sleep_tx; // Sleep transistor
}
else
{
dt = &g_tp.peri_global;
}
double c_junc_area = dt->C_junc;
double c_junc_sidewall = dt->C_junc_sidewall;
double c_fringe = 2*dt->C_fringe;
double c_overlap = 2*dt->C_overlap;
double drain_C_metal_connecting_folded_tr = 0;
// determine the width of the transistor after folding (if it is getting folded)
if (next_arg_thresh_folding_width_or_height_cell == 0)
{ // interpret fold_dimension as the the folding width threshold
// i.e. the value of transistor width above which the transistor gets folded
w_folded_tr = fold_dimension;
}
else
{ // interpret fold_dimension as the height of the cell that this transistor is part of.
double h_tr_region = fold_dimension - 2 * g_tp.HPOWERRAIL;
// TODO : w_folded_tr must come from Component::compute_gate_area()
double ratio_p_to_n = 2.0 / (2.0 + 1.0);
if (nchannel)
{
w_folded_tr = (1 - ratio_p_to_n) * (h_tr_region - g_tp.MIN_GAP_BET_P_AND_N_DIFFS);
}
else
{
w_folded_tr = ratio_p_to_n * (h_tr_region - g_tp.MIN_GAP_BET_P_AND_N_DIFFS);
}
}
int num_folded_tr = (int) (ceil(width / w_folded_tr));
if (num_folded_tr < 2)
{
w_folded_tr = width;
}
double total_drain_w = (g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact) + // only for drain
(stack - 1) * g_tp.spacing_poly_to_poly;
double drain_h_for_sidewall = w_folded_tr;
double total_drain_height_for_cap_wrt_gate = w_folded_tr + 2 * w_folded_tr * (stack - 1);
if (num_folded_tr > 1)
{
total_drain_w += (num_folded_tr - 2) * (g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact) +
(num_folded_tr - 1) * ((stack - 1) * g_tp.spacing_poly_to_poly);
if (num_folded_tr%2 == 0)
{
drain_h_for_sidewall = 0;
}
total_drain_height_for_cap_wrt_gate *= num_folded_tr;
drain_C_metal_connecting_folded_tr = g_tp.wire_local.C_per_um * total_drain_w;
}
double drain_C_area = c_junc_area * total_drain_w * w_folded_tr;
double drain_C_sidewall = c_junc_sidewall * (drain_h_for_sidewall + 2 * total_drain_w);
double drain_C_wrt_gate = (c_fringe + c_overlap) * total_drain_height_for_cap_wrt_gate;
return (drain_C_area + drain_C_sidewall + drain_C_wrt_gate + drain_C_metal_connecting_folded_tr);
}
double tr_R_on(
double width,
int nchannel,
int stack,
bool _is_dram,
bool _is_cell,
bool _is_wl_tr,
bool _is_sleep_tx)
{
const /*TechnologyParameter::*/DeviceType * dt;
if ((_is_dram) && (_is_cell))
{
dt = &g_tp.dram_acc; //DRAM cell access transistor
}
else if ((_is_dram) && (_is_wl_tr))
{
dt = &g_tp.dram_wl; //DRAM wordline transistor
}
else if ((!_is_dram) && _is_cell)
{
dt = &g_tp.sram_cell; // SRAM cell access transistor
}
else if (_is_sleep_tx)
{
dt = &g_tp.sleep_tx; // Sleep transistor
}
else
{
dt = &g_tp.peri_global;
}
double restrans = (nchannel) ? dt->R_nch_on : dt->R_pch_on;
return (stack * restrans / width);
}
/* This routine operates in reverse: given a resistance, it finds
* the transistor width that would have this R. It is used in the
* data wordline to estimate the wordline driver size. */
// returns width in um
double R_to_w(
double res,
int nchannel,
bool _is_dram,
bool _is_cell,
bool _is_wl_tr,
bool _is_sleep_tx)
{
const /*TechnologyParameter::*/DeviceType * dt;
if ((_is_dram) && (_is_cell))
{
dt = &g_tp.dram_acc; //DRAM cell access transistor
}
else if ((_is_dram) && (_is_wl_tr))
{
dt = &g_tp.dram_wl; //DRAM wordline transistor
}
else if ((!_is_dram) && (_is_cell))
{
dt = &g_tp.sram_cell; // SRAM cell access transistor
}
else if (_is_sleep_tx)
{
dt = &g_tp.sleep_tx; // Sleep transistor
}
else
{
dt = &g_tp.peri_global;
}
double restrans = (nchannel) ? dt->R_nch_on : dt->R_pch_on;
return (restrans / res);
}
// Returns the n_to_p_eff_curr_drv_ratio of the device class selected by the
// flags; callers use it as the PMOS-to-NMOS sizing ratio.
//   DRAM wordline transistor -> g_tp.dram_wl
//   sleep transistor         -> g_tp.sleep_tx
//   everything else          -> g_tp.peri_global
double pmos_to_nmos_sz_ratio(
    bool _is_dram,
    bool _is_wl_tr,
    bool _is_sleep_tx)
{
    if (_is_dram && _is_wl_tr)
    {
        return g_tp.dram_wl.n_to_p_eff_curr_drv_ratio;
    }
    if (_is_sleep_tx)
    {
        return g_tp.sleep_tx.n_to_p_eff_curr_drv_ratio;
    }
    // DRAM or SRAM, all other transistors
    return g_tp.peri_global.n_to_p_eff_curr_drv_ratio;
}
// Gate delay approximation from "Timing Models for MOS Circuits" by
// Mark Horowitz, 1984.
//
//   inputramptime  input rise time
//   tf             time constant of the gate
//   vs1, vs2       threshold voltages (used inside log(), so they are
//                  presumably normalised to the supply -- TODO confirm)
//   rise           RISE when the input rises, anything else for a falling input
//
// With a zero ramp time and equal thresholds the delay reduces to
// tf * |ln(vs1)|. Otherwise the rising and falling cases use the same
// formula shape with b = 0.5 and b = 0.4 respectively.
double horowitz(
    double inputramptime,
    double tf,
    double vs1,
    double vs2,
    int rise)
{
    if (inputramptime == 0 && vs1 == vs2)
    {
        double ln_vs1 = log(vs1);
        if (vs1 < 1)
        {
            ln_vs1 = -ln_vs1;
        }
        return tf * ln_vs1;
    }

    const double ramp_ratio = inputramptime / tf;
    double delay;
    if (rise == RISE)
    {
        const double b_rise = 0.5;
        delay = tf * sqrt(log(vs1)*log(vs1) + 2*ramp_ratio*b_rise*(1.0 - vs1)) + tf*(log(vs1) - log(vs2));
    }
    else
    {
        const double b_fall = 0.4;
        delay = tf * sqrt(log(1.0 - vs1)*log(1.0 - vs1) + 2*ramp_ratio*b_fall*(vs1)) + tf*(log(1.0 - vs1) - log(1.0 - vs2));
    }
    return (delay);
}
double cmos_Ileak(
double nWidth,
double pWidth,
bool _is_dram,
bool _is_cell,
bool _is_wl_tr,
bool _is_sleep_tx)
{
/*TechnologyParameter::*/DeviceType * dt;
if ((!_is_dram)&&(_is_cell))
{ //SRAM cell access transistor
dt = &(g_tp.sram_cell);
}
else if ((_is_dram)&&(_is_wl_tr))
{ //DRAM wordline transistor
dt = &(g_tp.dram_wl);
}
else if (_is_sleep_tx)
{
dt = &g_tp.sleep_tx; // Sleep transistor
}
else
{ //DRAM or SRAM all other transistors
dt = &(g_tp.peri_global);
}
return nWidth*dt->I_off_n + pWidth*dt->I_off_p;
}
// Partial factorial: the product m * (m+1) * ... * n.
// With m == 1 this is the ordinary n!.
//
// When the range is empty (m > n) the result is the empty product, 1.
// The previous implementation returned m in that case, so for example
// factorial(n, n + 1) yielded n + 1 and made combination(n, n) come out as
// n + 1 instead of 1.
//
// The product is accumulated in a plain int and overflows for large ranges
// (13! already exceeds a 32-bit int).
int factorial(int n, int m)
{
    if (m > n)
    {
        return 1;
    }
    int product = m;
    for (int i = m + 1; i <= n; i++)
    {
        product *= i;
    }
    return product;
}
// Binomial coefficient C(n, m): the number of ways to choose m items out of n.
// Returns 0 when m is negative or larger than n.
//
// Computed with the multiplicative formula
//   C(n, m) = prod_{i=1..m} (n - m + i) / i
// in which every intermediate value is itself a binomial coefficient, so each
// division is exact. This replaces a quotient of factorials that returned
// n + 1 for C(n, n) and overflowed int for modest n.
int combination(int n, int m)
{
    if (m < 0 || m > n)
    {
        return 0;
    }
    // C(n, m) == C(n, n - m); iterate over the smaller of the two.
    if (m > n - m)
    {
        m = n - m;
    }
    long long result = 1;
    for (int i = 1; i <= m; ++i)
    {
        result = result * (n - m + i) / i;
    }
    return (int) result;
}
// NMOS on-current estimate: nwidth * I_on_n of the selected device entry.
//   !dram && cell     -> g_tp.sram_cell
//   dram && wordline  -> g_tp.dram_wl
//   sleep transistor  -> g_tp.sleep_tx
//   otherwise         -> g_tp.peri_global
double simplified_nmos_Isat(
    double nwidth,
    bool _is_dram,
    bool _is_cell,
    bool _is_wl_tr,
    bool _is_sleep_tx)
{
    if (!_is_dram && _is_cell)
    {
        return nwidth * g_tp.sram_cell.I_on_n;    // SRAM cell access transistor
    }
    if (_is_dram && _is_wl_tr)
    {
        return nwidth * g_tp.dram_wl.I_on_n;      // DRAM wordline transistor
    }
    if (_is_sleep_tx)
    {
        return nwidth * g_tp.sleep_tx.I_on_n;     // sleep transistor
    }
    return nwidth * g_tp.peri_global.I_on_n;      // all other transistors
}
// PMOS on-current estimate. The PMOS current is derived from the NMOS
// on-current of the selected device entry, scaled down by its n-to-p
// effective current drive ratio:
//   pwidth * I_on_n / n_to_p_eff_curr_drv_ratio
// Device selection: SRAM cell, DRAM wordline, sleep transistor, else peripheral.
double simplified_pmos_Isat(
    double pwidth,
    bool _is_dram,
    bool _is_cell,
    bool _is_wl_tr,
    bool _is_sleep_tx)
{
    if (!_is_dram && _is_cell)
    {
        // SRAM cell access transistor
        return pwidth * g_tp.sram_cell.I_on_n/g_tp.sram_cell.n_to_p_eff_curr_drv_ratio;
    }
    if (_is_dram && _is_wl_tr)
    {
        // DRAM wordline transistor
        return pwidth * g_tp.dram_wl.I_on_n/g_tp.dram_wl.n_to_p_eff_curr_drv_ratio;
    }
    if (_is_sleep_tx)
    {
        // sleep transistor
        return pwidth * g_tp.sleep_tx.I_on_n/g_tp.sleep_tx.n_to_p_eff_curr_drv_ratio;
    }
    // DRAM or SRAM, all other transistors
    return pwidth * g_tp.peri_global.I_on_n/g_tp.peri_global.n_to_p_eff_curr_drv_ratio;
}
// NMOS off-state leakage: nwidth * I_off_n of the selected device entry.
// Device selection: SRAM cell, DRAM wordline, sleep transistor, else peripheral.
double simplified_nmos_leakage(
    double nwidth,
    bool _is_dram,
    bool _is_cell,
    bool _is_wl_tr,
    bool _is_sleep_tx)
{
    if (!_is_dram && _is_cell)
    {
        return nwidth * g_tp.sram_cell.I_off_n;   // SRAM cell access transistor
    }
    if (_is_dram && _is_wl_tr)
    {
        return nwidth * g_tp.dram_wl.I_off_n;     // DRAM wordline transistor
    }
    if (_is_sleep_tx)
    {
        return nwidth * g_tp.sleep_tx.I_off_n;    // sleep transistor
    }
    return nwidth * g_tp.peri_global.I_off_n;     // all other transistors
}
// PMOS off-state leakage: pwidth * I_off_p of the selected device entry.
// Device selection: SRAM cell, DRAM wordline, sleep transistor, else peripheral.
double simplified_pmos_leakage(
    double pwidth,
    bool _is_dram,
    bool _is_cell,
    bool _is_wl_tr,
    bool _is_sleep_tx)
{
    if (!_is_dram && _is_cell)
    {
        return pwidth * g_tp.sram_cell.I_off_p;   // SRAM cell access transistor
    }
    if (_is_dram && _is_wl_tr)
    {
        return pwidth * g_tp.dram_wl.I_off_p;     // DRAM wordline transistor
    }
    if (_is_sleep_tx)
    {
        return pwidth * g_tp.sleep_tx.I_off_p;    // sleep transistor
    }
    return pwidth * g_tp.peri_global.I_off_p;     // all other transistors
}
// NMOS gate leakage: nWidth * I_g_on_n of the selected device entry.
// Device selection: SRAM cell, DRAM wordline, sleep transistor, else peripheral.
double cmos_Ig_n(
    double nWidth,
    bool _is_dram,
    bool _is_cell,
    bool _is_wl_tr,
    bool _is_sleep_tx)
{
    if (!_is_dram && _is_cell)
    {
        return nWidth*g_tp.sram_cell.I_g_on_n;    // SRAM cell access transistor
    }
    if (_is_dram && _is_wl_tr)
    {
        return nWidth*g_tp.dram_wl.I_g_on_n;      // DRAM wordline transistor
    }
    if (_is_sleep_tx)
    {
        return nWidth*g_tp.sleep_tx.I_g_on_n;     // sleep transistor
    }
    return nWidth*g_tp.peri_global.I_g_on_n;      // all other transistors
}
// PMOS gate leakage: pWidth * I_g_on_p of the selected device entry.
// Device selection: SRAM cell, DRAM wordline, sleep transistor, else peripheral.
double cmos_Ig_p(
    double pWidth,
    bool _is_dram,
    bool _is_cell,
    bool _is_wl_tr,
    bool _is_sleep_tx)
{
    if (!_is_dram && _is_cell)
    {
        return pWidth*g_tp.sram_cell.I_g_on_p;    // SRAM cell access transistor
    }
    if (_is_dram && _is_wl_tr)
    {
        return pWidth*g_tp.dram_wl.I_g_on_p;      // DRAM wordline transistor
    }
    if (_is_sleep_tx)
    {
        return pWidth*g_tp.sleep_tx.I_g_on_p;     // sleep transistor
    }
    return pWidth*g_tp.peri_global.I_g_on_p;      // all other transistors
}
// Sub-threshold leakage of a gate, averaged over its 2^fanin input states.
//
//   nWidth, pWidth  widths of the NMOS / PMOS devices
//   fanin           number of inputs (asserted >= 1)
//   g_type          kind of gate: nmos, pmos, inv, nand, nor, tri or tg
//   flags           forwarded to simplified_{n,p}mos_leakage() to pick the device
//   topo            for bare nmos/pmos networks: parallel or series
//
// Series stacks use UNI_LEAK_STACK_FACTOR^(k-1) as the leakage reduction
// for k off transistors, weighted by combination(fanin, k).
// An unknown g_type trips assert(0).
double cmos_Isub_leakage(
    double nWidth,
    double pWidth,
    int fanin,
    enum Gate_type g_type,
    bool _is_dram,
    bool _is_cell,
    bool _is_wl_tr,
    bool _is_sleep_tx,
    enum Half_net_topology topo)
{
    assert (fanin>=1);

    const double n_off = simplified_nmos_leakage(nWidth, _is_dram, _is_cell, _is_wl_tr, _is_sleep_tx);
    const double p_off = simplified_pmos_leakage(pWidth, _is_dram, _is_cell, _is_wl_tr, _is_sleep_tx);
    const int state_count = int(pow(2.0, fanin));

    double leak = 0;
    int off_count;   // number of transistors that are off in the stack

    switch (g_type)
    {
    case nmos:
        if (fanin==1)
        {
            leak = n_off/state_count;
        }
        else if (topo==parallel)
        {
            // only when all tx are off is the leakage non-zero;
            // that state has probability 1/state_count
            leak = n_off*fanin/state_count;
        }
        else
        {
            // off_count == 0 contributes no leakage, so start at 1
            for (off_count=1; off_count<=fanin; off_count++)
            {
                leak += n_off*pow(UNI_LEAK_STACK_FACTOR,(off_count-1))*combination(fanin, off_count);
            }
            leak /= state_count;
        }
        break;

    case pmos:
        if (fanin==1)
        {
            leak = p_off/state_count;
        }
        else if (topo==parallel)
        {
            // only when all tx are off is the leakage non-zero;
            // that state has probability 1/state_count
            leak = p_off*fanin/state_count;
        }
        else
        {
            // off_count == 0 contributes no leakage, so start at 1
            for (off_count=1; off_count<=fanin; off_count++)
            {
                leak += p_off*pow(UNI_LEAK_STACK_FACTOR,(off_count-1))*combination(fanin, off_count);
            }
            leak /= state_count;
        }
        break;

    case inv:
        leak = (n_off + p_off)/2;
        break;

    case nand:
        leak += fanin*p_off;   // the pullup network
        for (off_count=1; off_count<=fanin; off_count++)   // the pulldown network
        {
            leak += n_off*pow(UNI_LEAK_STACK_FACTOR,(off_count-1))*combination(fanin, off_count);
        }
        leak /= state_count;
        break;

    case nor:
        for (off_count=1; off_count<=fanin; off_count++)   // the pullup network
        {
            leak += p_off*pow(UNI_LEAK_STACK_FACTOR,(off_count-1))*combination(fanin, off_count);
        }
        leak += fanin*n_off;   // the pulldown network
        leak /= state_count;
        break;

    case tri:
        leak += (n_off + p_off)/2;              // enabled
        leak += n_off*UNI_LEAK_STACK_FACTOR;    // disabled: upper bound of leakage power
        leak /= 2;
        break;

    case tg:
        leak = (n_off + p_off)/2;
        break;

    default:
        assert(0);
        break;
    }

    return leak;
}
// Gate leakage of a gate, averaged over its 2^fanin input states.
//
//   nWidth, pWidth  widths of the NMOS / PMOS devices
//   fanin           number of inputs (asserted >= 1)
//   g_type          kind of gate: nmos, pmos, inv, nand, nor, tri or tg
//   flags           forwarded to cmos_Ig_n() / cmos_Ig_p() to pick the device
//   topo            for bare nmos/pmos networks: parallel or series
//
// For every multi-input case the per-state contributions are summed and the
// sum is divided by the number of states. The bare nmos/pmos *parallel*
// branches used to skip that division (unlike the series branch and unlike
// the identical parallel sums inside nand/nor), which returned a total over
// all states instead of the average; the division now covers both topologies.
// An unknown g_type trips assert(0).
double cmos_Ig_leakage(
    double nWidth,
    double pWidth,
    int fanin,
    enum Gate_type g_type,
    bool _is_dram,
    bool _is_cell,
    bool _is_wl_tr,
    bool _is_sleep_tx,
    enum Half_net_topology topo)
{
    assert (fanin>=1);
    double nmos_leak = cmos_Ig_n(nWidth, _is_dram, _is_cell, _is_wl_tr, _is_sleep_tx);
    double pmos_leak = cmos_Ig_p(pWidth, _is_dram, _is_cell, _is_wl_tr, _is_sleep_tx);
    double Ig_on=0;
    int num_on_tx;   // number of transistors that are on
    const int num_states = int(pow(2.0, fanin));

    switch (g_type)
    {
    case nmos:
        if (fanin==1)
        {
            Ig_on = nmos_leak/num_states;
        }
        else
        {
            if (topo==parallel)
            {
                for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)
                {
                    Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx;
                }
            }
            else
            {
                Ig_on += nmos_leak * fanin;   // network with all TXs on
                for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)   // num_on_tx in [1, n-1]
                {
                    // TODO: this is an approximation; a precise computation is very complicated.
                    Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;
                }
            }
            // Average over all input states (applies to both topologies).
            Ig_on /= num_states;
        }
        break;

    case pmos:
        if (fanin==1)
        {
            Ig_on = pmos_leak/num_states;
        }
        else
        {
            if (topo==parallel)
            {
                for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)
                {
                    Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx;
                }
            }
            else
            {
                Ig_on += pmos_leak * fanin;   // network with all TXs on
                for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)   // num_on_tx in [1, n-1]
                {
                    // TODO: this is an approximation; a precise computation is very complicated.
                    Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;
                }
            }
            // Average over all input states (applies to both topologies).
            Ig_on /= num_states;
        }
        break;

    case inv:
        Ig_on = (nmos_leak + pmos_leak)/2;
        break;

    case nand:
        // pull up network (parallel PMOS), num_on_tx in [1, n]
        for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)
        {
            Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx;
        }
        // pull down network (series NMOS)
        Ig_on += nmos_leak * fanin;   // all TXs on
        for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)   // num_on_tx in [1, n-1]
        {
            // TODO: this is an approximation; a precise computation is very complicated.
            Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;
        }
        Ig_on /= num_states;
        break;

    case nor:
        // pull up network (series PMOS)
        Ig_on += pmos_leak * fanin;   // all TXs on
        for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)
        {
            Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;
        }
        // pull down network (parallel NMOS), num_on_tx in [1, n]
        for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)
        {
            Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx;
        }
        Ig_on /= num_states;
        break;

    case tri:
        Ig_on += (2*nmos_leak + 2*pmos_leak)/2;   // enabled
        Ig_on += (nmos_leak + pmos_leak)/2;       // disabled: upper bound of leakage power
        Ig_on /= 2;
        break;

    case tg:
        Ig_on = (nmos_leak + pmos_leak)/2;
        break;

    default:
        assert(0);
        break;
    }

    return Ig_on;
}
// Simplified short-circuit energy estimate for a gate (the value is an
// energy, despite the "p_" naming used historically).
//
//   vt              threshold voltage
//   velocity_index  velocity saturation index (alpha)
//   c_in, c_out     input and output capacitance; fanout = c_out / c_in
//   w_nmos, w_pmos  accepted for interface compatibility, not used
//   i_on_n, i_on_p  on-currents of this gate
//   i_on_n_in, i_on_p_in  on-currents of the driving gate
//   vdd             supply voltage
//
// Only the "low" approximation is used for both the discharge and the charge
// transition; the result is their arithmetic mean. (A harmonic mean with a
// "high" approximation was tried previously and abandoned.)
//
// The leading coefficient is the real number 10/3. It was previously written
// as the integer expression 10/3, which C++ evaluates to 3 and therefore
// under-estimated the result by 10%.
//
// No guarding is done against zero c_in, i_on_*, vdd or velocity_index.
double shortcircuit_simple(
    double vt,
    double velocity_index,
    double c_in,
    double c_out,
    double w_nmos,
    double w_pmos,
    double i_on_n,
    double i_on_p,
    double i_on_n_in,
    double i_on_p_in,
    double vdd)
{
    const double fo_n            = i_on_n/i_on_n_in;
    const double fo_p            = i_on_p/i_on_p_in;
    const double fanout          = c_out/c_in;
    const double beta_ratio      = i_on_p/i_on_n;
    const double vt_to_vdd_ratio = vt/vdd;

    // Part of the expression shared by both transitions.
    const double common = 10.0/3.0*(pow(((vdd-vt)-vt_to_vdd_ratio),3.0)/pow(velocity_index,2.0)/pow(2.0,3*vt_to_vdd_ratio*vt_to_vdd_ratio))*c_in*vdd*vdd;

    const double p_short_circuit_discharge = common*fo_p*fo_p/fanout/beta_ratio;
    const double p_short_circuit_charge    = common*fo_n*fo_n/fanout*beta_ratio;

    return (p_short_circuit_discharge + p_short_circuit_charge)/2;
}
// Short-circuit energy estimate based on an alpha-power style closed form
// (f_alpha, k_v, g_v_alpha and h_v_alpha below).
//
// NOTE(review): as written this function ALWAYS returns 0. The final formula
// is stored in p_short_circuit_discharge, but the value returned is
// p_short_circuit, which is initialised to 0 and never assigned (the only
// assignment is commented out). Confirm whether that is intentional before
// changing it: g_v_alpha and h_v_alpha raise (1 - velocity_index) and
// (1 - 2*velocity_index) to non-integer powers, which yields NaN for a
// non-integer velocity_index greater than 1, so simply returning the computed
// value could propagate NaN to callers.
//
// c_out, w_nmos, w_pmos and i_on_n_in are accepted but not used; fanout is
// fixed at 1.
double shortcircuit(
double vt,
double velocity_index,
double c_in,
double c_out,
double w_nmos,
double w_pmos,
double i_on_n,
double i_on_p,
double i_on_n_in,
double i_on_p_in,
double vdd)
{
double p_short_circuit=0, p_short_circuit_discharge=0;//, p_short_circuit_charge, p_short_circuit_discharge_low, p_short_circuit_discharge_high, p_short_circuit_charge_low, p_short_circuit_charge_high; //this is actually energy
double /*fo_n,*/ fo_p, fanout, beta_ratio /*,vt_to_vdd_ratio*/;
double f_alpha, k_v, e, g_v_alpha, h_v_alpha;
///fo_n = i_on_n/i_on_n_in;
fo_p = i_on_p/i_on_p_in;
fanout = 1;
beta_ratio = i_on_p/i_on_n;
///vt_to_vdd_ratio = vt/vdd;
// Truncated value of Euler's number.
e = 2.71828;
f_alpha = 1/(velocity_index+2) -velocity_index/(2*(velocity_index+3)) +velocity_index/(velocity_index+4)*(velocity_index/2-1);
k_v = 0.9/0.8+(vdd-vt)/0.8*log(10*(vdd-vt)/e);
// NOTE(review): the bases below use velocity_index where a normalised
// threshold (vt/vdd) might be expected -- verify against the source paper.
g_v_alpha = (velocity_index + 1)*pow((1-velocity_index),velocity_index)*pow((1-velocity_index),velocity_index/2)/f_alpha/pow((1-velocity_index-velocity_index),(velocity_index/2+velocity_index+2));
h_v_alpha = pow(2, velocity_index)*(velocity_index+1)*pow((1-velocity_index),velocity_index)/pow((1-velocity_index-velocity_index),(velocity_index+1));
//p_short_circuit_discharge_low = 10/3*(pow(0.5-vt_to_vdd_ratio,3.0)/pow(velocity_index,2.0)/pow(2.0,3*vt_to_vdd_ratio*vt_to_vdd_ratio))*c_in*vdd*vdd*fo_p*fo_p/fanout/beta_ratio;
// p_short_circuit_discharge_low = 10/3*(pow(((vdd-vt)-vt_to_vdd_ratio),3.0)/pow(velocity_index,2.0)/pow(2.0,3*vt_to_vdd_ratio*vt_to_vdd_ratio))*c_in*vdd*vdd*fo_p*fo_p/fanout/beta_ratio;
// p_short_circuit_charge_low = 10/3*(pow(((vdd-vt)-vt_to_vdd_ratio),3.0)/pow(velocity_index,2.0)/pow(2.0,3*vt_to_vdd_ratio*vt_to_vdd_ratio))*c_in*vdd*vdd*fo_n*fo_n/fanout*beta_ratio;
// double t1, t2, t3, t4, t5;
// t1=pow(((vdd-vt)-vt_to_vdd_ratio),3);
// t2=pow(velocity_index,2.0);
// t3=pow(2.0,3*vt_to_vdd_ratio*vt_to_vdd_ratio);
// t4=t1/t2/t3;
//
// cout <<t1<<"t1\n"<<t2<<"t2\n"<<t3<<"t3\n"<<t4<<"t4\n"<<fanout<<endl;
//
//
// p_short_circuit_discharge_high = pow(((vdd-vt)-vt_to_vdd_ratio),1.5)*c_in*vdd*vdd*fo_p/10/pow(2, 3*vt_to_vdd_ratio+2*velocity_index);
// p_short_circuit_charge_high = pow(((vdd-vt)-vt_to_vdd_ratio),1.5)*c_in*vdd*vdd*fo_n/10/pow(2, 3*vt_to_vdd_ratio+2*velocity_index);
//
// p_short_circuit_discharge = 1.0/(1.0/p_short_circuit_discharge_low + 1.0/p_short_circuit_discharge_high);
// p_short_circuit_charge = 1/(1/p_short_circuit_charge_low + 1/p_short_circuit_charge_high);
//
// p_short_circuit = (p_short_circuit_discharge + p_short_circuit_charge)/2;
//
// p_short_circuit = p_short_circuit_discharge;
// Computed but never returned -- see the NOTE(review) at the top.
p_short_circuit_discharge = k_v*vdd*vdd*c_in*fo_p*fo_p/((vdd-vt)*g_v_alpha*fanout*beta_ratio/2/k_v + h_v_alpha*fo_p);
return (p_short_circuit);
}
//ali
// Resistance per unit length of a wire with a barrier liner and dishing.
//
// The conducting cross-section is the drawn cross-section reduced by the
// barrier (once in height, on both sides in width) and by the dishing loss
// in height. The bulk resistivity is scaled by alpha_scatter.
// No check is made for a zero or negative effective cross-section.
double wire_resistance(double resistivity, double wire_width, double wire_thickness,
                       double barrier_thickness, double dishing_thickness, double alpha_scatter)
{
    const double conductor_height = wire_thickness - barrier_thickness - dishing_thickness;
    const double conductor_width  = wire_width - 2 * barrier_thickness;
    return alpha_scatter * resistivity / (conductor_height * conductor_width);
}
// Capacitance per unit length of a wire: the sum of
//   - plate capacitance to the layers above and below (factor 2),
//   - sidewall capacitance to both neighbours (factor 2), scaled by the
//     Miller factor, and
//   - the supplied fringe capacitance.
double wire_capacitance(double wire_width, double wire_thickness, double wire_spacing,
                        double ild_thickness, double miller_value, double horiz_dielectric_constant,
                        double vert_dielectric_constant, double fringe_cap)
{
    const double plate_cap = 2 * PERMITTIVITY_FREE_SPACE * vert_dielectric_constant * wire_width / ild_thickness;
    const double neighbour_cap = 2 * PERMITTIVITY_FREE_SPACE * miller_value * horiz_dielectric_constant * wire_thickness / wire_spacing;
    return(plate_cap + neighbour_cap + fringe_cap);
}
//CACTI3DD TSV
// Resistance of a through-silicon via modelled as a solid cylinder:
//   resistivity * length / (pi * radius^2) + contact resistance
// pi is approximated as 3.1416.
double tsv_resistance(double resistivity, double tsv_len, double tsv_diam, double tsv_contact_resistance)
{
    const double radius = tsv_diam/2;
    const double cross_section = 3.1416 * radius * radius;
    return(resistivity * tsv_len / cross_section + tsv_contact_resistance);
}
// Capacitance of a through-silicon via.
//
// The self capacitance is the series combination of the liner (oxide)
// capacitance and the depletion capacitance, both using the coaxial-cylinder
// formula. Coupling to neighbouring TSVs is added as a lateral and a
// diagonal term weighted by fixed constants (4.1 and 5.3).
// Silicon permittivity is taken as 11.9 * PERMITTIVITY_FREE_SPACE and pi as
// 3.1416. When g_ip->print_detail_debug is set, the oxide and self
// capacitances are printed in fF.
double tsv_capacitance(double tsv_len, double tsv_diam, double tsv_pitch, double dielec_thickness, double liner_dielectric_constant, double depletion_width)
{
    const double e_si = PERMITTIVITY_FREE_SPACE * 11.9, PI = 3.1416;
    const double lateral_weight  = 4.1;
    const double diagonal_weight = 5.3;

    const double oxide_cap = 2 * PI * PERMITTIVITY_FREE_SPACE * liner_dielectric_constant * tsv_len / log(1 + dielec_thickness / (tsv_diam/2));
    const double depl_cap  = 2 * PI * e_si *tsv_len / log(1 + depletion_width / (dielec_thickness + tsv_diam/2));

    // Liner and depletion capacitances act in series.
    const double series_cap = 1 / (1/oxide_cap + 1/depl_cap);

    if (g_ip->print_detail_debug)
    {
        cout<<"TSV ox cap: "<<oxide_cap*1e15<<" fF"<<endl;
        cout<<"TSV self cap: "<<series_cap*1e15<<" fF"<<endl;
    }

    const double lateral_cap  = 0.4 * (0.225 * log(0.97 * tsv_len / tsv_diam) + 0.53) * e_si / (tsv_pitch - tsv_diam) * PI * tsv_diam * tsv_len;
    const double diagonal_cap = 0.4 * (0.225 * log(0.97 * tsv_len / tsv_diam) + 0.53) * e_si / (1.414 * tsv_pitch - tsv_diam) * PI * tsv_diam * tsv_len;

    return(series_cap + lateral_weight * lateral_cap + diagonal_weight * diagonal_cap);
}
// Area occupied by one TSV: a square whose side is the TSV pitch.
double tsv_area(double tsv_pitch)
{
    const double footprint = pow(tsv_pitch,2);
    return footprint;
}
// end ali

View file

@ -0,0 +1,305 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#ifndef __BASIC_CIRCUIT_H__
#define __BASIC_CIRCUIT_H__
#include "const.h"
///#include "cacti_interface.h"
using namespace std;
#define UNI_LEAK_STACK_FACTOR 0.43
// Integer helpers.
// base raised to the n-th power by repeated multiplication (int arithmetic).
int powers (int base, int n);
// True when val is a positive power of two.
bool is_pow2(int64_t val);
// floor(log2(num)); terminates the process when num == 0.
uint32_t _log2(uint64_t num);
// Product of the integers from m up to n; with the default m = 1 this is n!.
int factorial(int n, int m = 1);
// Number of ways to choose m items out of n.
int combination(int n, int m);
//#define DBG
// PRINTDW(a): debug-only statement wrapper.
// With DBG defined it expands to an empty statement followed by `a;`;
// otherwise it expands to just an empty statement, so `a` is never evaluated.
//
// Each definition is kept on a single line. The previous non-debug
// definition ended in a line-continuation backslash, which spliced the
// following #endif into the macro body and left the #ifdef unterminated.
#ifdef DBG
#define PRINTDW(a) ; a;
#else
#define PRINTDW(a) ;
#endif
// Identifies where a wire sits relative to a mat.
// NOTE(review): meaning inferred from the enumerator names; the consumers of
// this enum are not visible in this header.
enum Wire_placement {
outside_mat,
inside_mat,
local_wires
};
// Kind of H-tree network: address, data-in, data-out, search-in or search-out.
// Enumerator values are implicit (0..4), so their order must not change.
enum Htree_type {
Add_htree,
Data_in_htree,
Data_out_htree,
Search_in_htree,
Search_out_htree,
};
//CACTI3DD
// Kind of memory bus path: row address, column address or data.
// Enumerator values are implicit (0..2), so their order must not change.
enum Memorybus_type {
Row_add_path,
Col_add_path,
Data_path
/*in_network,
out_network*/
};
/*enum Part_grain {
Coarse_rank_level, //Samsung 2009 3D DRAM
Fine_rank_level, //Micron HMC 2011
Coarse_bank_level, //ITRS fine TSV supported
Fine_bank_level
};*/
// Gate kinds understood by cmos_Isub_leakage() and cmos_Ig_leakage().
// nmos / pmos are bare transistor networks; inv, nand and nor are the usual
// static CMOS gates; tri and tg are presumably a tri-state buffer and a
// transmission gate -- TODO confirm.
// Enumerator values are implicit, so their order must not change.
enum Gate_type {
nmos,
pmos,
inv,
nand,
nor,
tri,
tg
};
// How the transistors of a bare nmos/pmos network are connected; passed as
// the `topo` argument of the leakage functions (default: series).
enum Half_net_topology {
parallel,
series
};
// Base-2 logarithm of a double.
double logtwo (double x);
// Capacitance / resistance / timing helpers.
// NOTE(review): the fourth flag is named _is_sram in the five prototypes
// below, while the other prototypes in this header call the same position
// _is_cell. Judging by the name it is a general "is a memory cell
// transistor" flag rather than SRAM-specific -- confirm against the
// definitions and consider renaming.
// Gate capacitance of a transistor of the given width.
double gate_C(
double width,
double wirelength,
bool _is_dram = false,
bool _is_sram = false,
bool _is_wl_tr = false,
bool _is_sleep_tx = false);
// Gate capacitance of a pass transistor.
double gate_C_pass(
double width,
double wirelength,
bool _is_dram = false,
bool _is_sram = false,
bool _is_wl_tr = false,
bool _is_sleep_tx = false);
// Drain capacitance of a possibly folded / stacked transistor.
// next_arg_thresh_folding_width_or_height_cell tells how fold_dimension
// is to be interpreted.
double drain_C_(
double width,
int nchannel,
int stack,
int next_arg_thresh_folding_width_or_height_cell,
double fold_dimension,
bool _is_dram = false,
bool _is_sram = false,
bool _is_wl_tr = false,
bool _is_sleep_tx = false);
// On-resistance of a stack of transistors of the given width.
double tr_R_on(
double width,
int nchannel,
int stack,
bool _is_dram = false,
bool _is_sram = false,
bool _is_wl_tr = false,
bool _is_sleep_tx = false);
// Transistor width that gives the requested on-resistance.
double R_to_w(
double res,
int nchannel,
bool _is_dram = false,
bool _is_sram = false,
bool _is_wl_tr = false,
bool _is_sleep_tx = false);
// Horowitz gate delay approximation; `rise` selects rising vs falling input.
double horowitz (
double inputramptime,
double tf,
double vs1,
double vs2,
int rise);
// PMOS-to-NMOS sizing ratio for the selected device class.
double pmos_to_nmos_sz_ratio(
bool _is_dram = false,
bool _is_wl_tr = false,
bool _is_sleep_tx = false);
// Per-transistor current helpers. Each takes a transistor width plus flags
// that choose the device class; every flag defaults to false.
// Off-state (sub-threshold) leakage of an NMOS device.
double simplified_nmos_leakage(
double nwidth,
bool _is_dram = false,
bool _is_cell = false,
bool _is_wl_tr = false,
bool _is_sleep_tx = false);
// Off-state (sub-threshold) leakage of a PMOS device.
double simplified_pmos_leakage(
double pwidth,
bool _is_dram = false,
bool _is_cell = false,
bool _is_wl_tr = false,
bool _is_sleep_tx = false);
// On-current of an NMOS device.
double simplified_nmos_Isat(
double nwidth,
bool _is_dram = false,
bool _is_cell = false,
bool _is_wl_tr = false,
bool _is_sleep_tx = false);
// On-current of a PMOS device.
double simplified_pmos_Isat(
double pwidth,
bool _is_dram = false,
bool _is_cell = false,
bool _is_wl_tr = false,
bool _is_sleep_tx = false);
// Combined off-state leakage of an NMOS/PMOS pair.
double cmos_Ileak(
double nWidth,
double pWidth,
bool _is_dram = false,
bool _is_cell = false,
bool _is_wl_tr = false,
bool _is_sleep_tx = false);
// Gate leakage of an NMOS device.
double cmos_Ig_n(
double nWidth,
bool _is_dram = false,
bool _is_cell = false,
bool _is_wl_tr= false,
bool _is_sleep_tx = false);
// Gate leakage of a PMOS device.
double cmos_Ig_p(
double pWidth,
bool _is_dram = false,
bool _is_cell = false,
bool _is_wl_tr= false,
bool _is_sleep_tx = false);
// Gate-level leakage estimates for a gate of kind g_type with `fanin`
// inputs. `topo` defaults to series; the device flags default to false.
// Sub-threshold leakage.
double cmos_Isub_leakage(
double nWidth,
double pWidth,
int fanin,
enum Gate_type g_type,
bool _is_dram = false,
bool _is_cell = false,
bool _is_wl_tr = false,
bool _is_sleep_tx = false,
enum Half_net_topology topo = series);
// Gate leakage.
double cmos_Ig_leakage(
double nWidth,
double pWidth,
int fanin,
enum Gate_type g_type,
bool _is_dram = false,
bool _is_cell = false,
bool _is_wl_tr = false,
bool _is_sleep_tx = false,
enum Half_net_topology topo = series);
// Short-circuit estimates. Both functions take the same eleven arguments:
// threshold voltage, velocity saturation index, input and output
// capacitance, NMOS/PMOS widths, on-currents of this gate and of the
// driving gate, and the supply voltage.
double shortcircuit(
double vt,
double velocity_index,
double c_in,
double c_out,
double w_nmos,
double w_pmos,
double i_on_n,
double i_on_p,
double i_on_n_in,
double i_on_p_in,
double vdd);
// Simplified closed-form variant.
double shortcircuit_simple(
double vt,
double velocity_index,
double c_in,
double c_out,
double w_nmos,
double w_pmos,
double i_on_n,
double i_on_p,
double i_on_n_in,
double i_on_p_in,
double vdd);
// Set a power point-product mask (strictly speaking this is not a real
// point product): stores a, b, c and d into pppv[0..3].
// Every value defaults to 1. pppv must point to at least four doubles.
inline void set_pppm(
    double * pppv,
    double a=1,
    double b=1,
    double c=1,
    double d=1
    ){
    const double mask[4] = { a, b, c, d };
    for (int idx = 0; idx < 4; ++idx)
    {
        pppv[idx] = mask[idx];
    }
}
// Stores a, b and c into sppv[0..2]; every value defaults to 1.
// sppv must point to at least three doubles.
// NOTE(review): the fourth value `d` is accepted but never stored, unlike
// set_pppm() -- confirm whether sppv is meant to have three or four entries.
inline void set_sppm(
    double * sppv,
    double a=1,
    double b=1,
    double c=1,
    double d=1
    ){
    const double mask[3] = { a, b, c };
    for (int idx = 0; idx < 3; ++idx)
    {
        sppv[idx] = mask[idx];
    }
}
//ali
// Wire and through-silicon-via (TSV) electrical helper prototypes; the
// definitions are outside this view.
// NOTE(review): units of the geometric arguments and of the return values are
// not visible from the declarations -- confirm against the definitions.

// Wire resistance from resistivity, wire cross-section (width, thickness),
// barrier and dishing thicknesses and a scattering factor.
double wire_resistance(double resistivity, double wire_width, double wire_thickness,
double barrier_thickness, double dishing_thickness, double alpha_scatter);
// Wire capacitance from wire geometry/spacing, inter-layer dielectric
// thickness, a Miller factor, the two dielectric constants and a fringe term.
double wire_capacitance(double wire_width, double wire_thickness, double wire_spacing,
double ild_thickness, double miller_value, double horiz_dielectric_constant,
double vert_dielectric_constant, double fringe_cap);
// TSV resistance / capacitance / area from the TSV dimensions.
double tsv_resistance(double resistivity, double tsv_len, double tsv_diam, double tsv_contact_resistance);
double tsv_capacitance(double tsv_len, double tsv_diam, double tsv_pitch, double dielec_thickness, double liner_dielectric_constant, double depletion_width);
double tsv_area(double tsv_pitch);
// end ali
#endif

306
T1/TP/TP1/cacti_7/cache.cfg Normal file
View file

@ -0,0 +1,306 @@
# Cache size
//-size (bytes) 2048
//-size (bytes) 4096
//-size (bytes) 32768
//-size (bytes) 262144
//-size (bytes) 1048576
//-size (bytes) 2097152
//-size (bytes) 4194304
//-size (bytes) 8388608
//-size (bytes) 16777216
//-size (bytes) 33554432
//-size (bytes) 134217728
//-size (bytes) 67108864
//-size (bytes) 1073741824
# power gating
-Array Power Gating - "false"
-WL Power Gating - "false"
-CL Power Gating - "false"
-Bitline floating - "false"
-Interconnect Power Gating - "false"
-Power Gating Performance Loss 0.01
# Line size
//-block size (bytes) 8
-block size (bytes) 64
# To model Fully Associative cache, set associativity to zero
//-associativity 0
-associativity 2
//-associativity 4
//-associativity 8
-size (bytes) 131072
-read-write port 1
-exclusive read port 0
-exclusive write port 0
-single ended read ports 0
# Multiple banks connected using a bus
-UCA bank count 1
//-technology (u) 0.022
//-technology (u) 0.040
//-technology (u) 0.032
-technology (u) 0.090
# following three parameters are meaningful only for main memories
-page size (bits) 8192
-burst length 8
-internal prefetch width 8
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
-Data array cell type - "itrs-hp"
//-Data array cell type - "itrs-lstp"
//-Data array cell type - "itrs-lop"
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
-Data array peripheral type - "itrs-hp"
//-Data array peripheral type - "itrs-lstp"
//-Data array peripheral type - "itrs-lop"
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
-Tag array cell type - "itrs-hp"
//-Tag array cell type - "itrs-lstp"
//-Tag array cell type - "itrs-lop"
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
-Tag array peripheral type - "itrs-hp"
//-Tag array peripheral type - "itrs-lstp"
//-Tag array peripheral type - "itrs-lop"
# Bus width include data bits and address bits required by the decoder
//-output/input bus width 16
-output/input bus width 512
// 300-400 in steps of 10
-operating temperature (K) 360
# Type of memory - cache (with a tag array) or ram (scratch ram similar to a register file)
# or main memory (no tag array and every access will happen at a page granularity Ref: CACTI 5.3 report)
-cache type "cache"
//-cache type "ram"
//-cache type "main memory"
# to model special structure like branch target buffers, directory, etc.
# change the tag size parameter
# if you want cacti to calculate the tagbits, set the tag size to "default"
-tag size (b) "default"
//-tag size (b) 22
# fast - data and tag access happen in parallel
# sequential - data array is accessed after accessing the tag array
# normal - data array lookup and tag access happen in parallel
# final data block is broadcasted in data array h-tree
# after getting the signal from the tag array
//-access mode (normal, sequential, fast) - "fast"
-access mode (normal, sequential, fast) - "normal"
//-access mode (normal, sequential, fast) - "sequential"
# DESIGN OBJECTIVE for UCA (or banks in NUCA)
-design objective (weight delay, dynamic power, leakage power, cycle time, area) 0:0:0:100:0
# Percentage deviation from the minimum value
# Ex: A deviation value of 10:1000:1000:1000:1000 will try to find an organization
# that compromises at most 10% delay.
# NOTE: Try reasonable values for % deviation. Inconsistent deviation
# percentage values will not produce any valid organizations. For example,
# 0:0:100:100:100 will try to identify an organization that has both
# least delay and dynamic power. Since such an organization is not possible, CACTI will
# throw an error. Refer CACTI-6 Technical report for more details
-deviate (delay, dynamic power, leakage power, cycle time, area) 20:100000:100000:100000:100000
# Objective for NUCA
-NUCAdesign objective (weight delay, dynamic power, leakage power, cycle time, area) 100:100:0:0:100
-NUCAdeviate (delay, dynamic power, leakage power, cycle time, area) 10:10000:10000:10000:10000
# Set optimize tag to ED or ED^2 to obtain a cache configuration optimized for
# energy-delay or energy-delay sq. product
# Note: Optimize tag will disable weight or deviate values mentioned above
# Set it to NONE to let weight and deviate values determine the
# appropriate cache configuration
//-Optimize ED or ED^2 (ED, ED^2, NONE): "ED"
-Optimize ED or ED^2 (ED, ED^2, NONE): "ED^2"
//-Optimize ED or ED^2 (ED, ED^2, NONE): "NONE"
-Cache model (NUCA, UCA) - "UCA"
//-Cache model (NUCA, UCA) - "NUCA"
# In order for CACTI to find the optimal NUCA bank value the following
# variable should be assigned 0.
-NUCA bank count 0
# NOTE: for nuca network frequency is set to a default value of
# 5GHz in time.c. CACTI automatically
# calculates the maximum possible frequency and downgrades this value if necessary
# By default CACTI considers both full-swing and low-swing
# wires to find an optimal configuration. However, it is possible to
# restrict the search space by changing the signaling from "default" to
# "fullswing" or "lowswing" type.
-Wire signaling (fullswing, lowswing, default) - "Global_30"
//-Wire signaling (fullswing, lowswing, default) - "default"
//-Wire signaling (fullswing, lowswing, default) - "lowswing"
//-Wire inside mat - "global"
-Wire inside mat - "semi-global"
//-Wire outside mat - "global"
-Wire outside mat - "semi-global"
-Interconnect projection - "conservative"
//-Interconnect projection - "aggressive"
# Contention in network (which is a function of core count and cache level) is one of
# the critical factor used for deciding the optimal bank count value
# core count can be 4, 8, or 16
//-Core count 4
-Core count 8
//-Core count 16
-Cache level (L2/L3) - "L3"
-Add ECC - "true"
//-Print level (DETAILED, CONCISE) - "CONCISE"
-Print level (DETAILED, CONCISE) - "DETAILED"
# for debugging
-Print input parameters - "true"
//-Print input parameters - "false"
# force CACTI to model the cache with the
# following Ndbl, Ndwl, Nspd, Ndsam,
# and Ndcm values
//-Force cache config - "true"
-Force cache config - "false"
-Ndwl 1
-Ndbl 1
-Nspd 0
-Ndcm 1
-Ndsam1 0
-Ndsam2 0
#### Default CONFIGURATION values for baseline external IO parameters to DRAM. More details can be found in the CACTI-IO technical report, especially Chapters 2 and 3.
# Memory Type (D3=DDR3, D4=DDR4, L=LPDDR2, W=WideIO, S=Serial). Additional memory types can be defined by the user in extio_technology.cc, along with their technology and configuration parameters.
-dram_type "DDR3"
//-dram_type "DDR4"
//-dram_type "LPDDR2"
//-dram_type "WideIO"
//-dram_type "Serial"
# Memory State (R=Read, W=Write, I=Idle or S=Sleep)
//-io state "READ"
-io state "WRITE"
//-io state "IDLE"
//-io state "SLEEP"
#Address bus timing. To alleviate the timing on the command and address bus due to high loading (shared across all memories on the channel), the interface allows for multi-cycle timing options.
//-addr_timing 0.5 //DDR
-addr_timing 1.0 //SDR (half of DQ rate)
//-addr_timing 2.0 //2T timing (One fourth of DQ rate)
//-addr_timing 3.0 // 3T timing (One sixth of DQ rate)
# Memory Density (Gbit per memory/DRAM die)
-mem_density 4 Gb //Valid values 2^n Gb
# IO frequency (MHz) (frequency of the external memory interface).
-bus_freq 800 MHz //As of current memory standards (2013), valid range 0 to 1.5 GHz for DDR3, 0 to 533 MHz for LPDDR2, 0 - 800 MHz for WideIO and 0 - 3 GHz for Low-swing differential. However this can change, and the user is free to define valid ranges based on new memory types or extending beyond existing standards for existing dram types.
# Duty Cycle (fraction of time in the Memory State defined above)
-duty_cycle 1.0 //Valid range 0 to 1.0
# Activity factor for Data (0->1 transitions) per cycle (for DDR, need to account for the higher activity in this parameter. E.g. max. activity factor for DDR is 1.0, for SDR is 0.5)
-activity_dq 1.0 //Valid range 0 to 1.0 for DDR, 0 to 0.5 for SDR
# Activity factor for Control/Address (0->1 transitions) per cycle (for DDR, need to account for the higher activity in this parameter. E.g. max. activity factor for DDR is 1.0, for SDR is 0.5)
-activity_ca 0.5 //Valid range 0 to 1.0 for DDR, 0 to 0.5 for SDR, 0 to 0.25 for 2T, and 0 to 0.17 for 3T
# Number of DQ pins
-num_dq 72 //Number of DQ pins. Includes ECC pins.
# Number of DQS pins. DQS is a data strobe that is sent along with a small number of data-lanes so the source synchronous timing is local to these DQ bits. Typically, 1 DQS per byte (8 DQ bits) is used. The DQS is also typically differential, just like the CLK pin.
-num_dqs 18 //2 x differential pairs. Include ECC pins as well. Valid range 0 to 18. For x4 memories, could have 36 DQS pins.
# Number of CA pins
-num_ca 25 //Valid range 0 to 35 pins.
# Number of CLK pins. CLK is typically a differential pair. In some cases additional CLK pairs may be used to limit the loading on the CLK pin.
-num_clk 2 //2 x differential pair. Valid values: 0/2/4.
# Number of Physical Ranks
-num_mem_dq 2 //Number of ranks (loads on DQ and DQS) per buffer/register. If multiple LRDIMMs or buffer chips exist, the analysis for capacity and power is reported per buffer/register.
# Width of the Memory Data Bus
-mem_data_width 8 //x4 or x8 or x16 or x32 memories. For WideIO upto x128.
# RTT Termination Resistance
-rtt_value 10000
# RON Termination Resistance
-ron_value 34
# Time of flight for DQ
-tflight_value
# Parameter related to MemCAD
# Number of BoBs: 1,2,3,4,5,6,
-num_bobs 1
# Memory System Capacity in GB
-capacity 80
# Number of Channel per BoB: 1,2.
-num_channels_per_bob 1
# First Metric for ordering different design points
-first metric "Cost"
#-first metric "Bandwidth"
#-first metric "Energy"
# Second Metric for ordering different design points
#-second metric "Cost"
-second metric "Bandwidth"
#-second metric "Energy"
# Third Metric for ordering different design points
#-third metric "Cost"
#-third metric "Bandwidth"
-third metric "Energy"
# Possible DIMM option to consider
#-DIMM model "JUST_UDIMM"
#-DIMM model "JUST_RDIMM"
#-DIMM model "JUST_LRDIMM"
-DIMM model "ALL"
#if channels of each bob have the same configurations
#-mirror_in_bob "T"
-mirror_in_bob "F"
#if we want to see all channels/bobs/memory configurations explored
#-verbose "T"
#-verbose "F"

View file

@ -0,0 +1,15 @@
Tech node (nm), Capacity (bytes), Number of banks, Associativity, Output width (bits), Access time (ns), Random cycle time (ns), Dynamic search energy (nJ), Dynamic read energy (nJ), Dynamic write energy (nJ), Standby leakage per bank(mW), Area (mm2), Ndwl, Ndbl, Nspd, Ndcm, Ndsam_level_1, Ndsam_level_2, Data arrary area efficiency %, Ntwl, Ntbl, Ntspd, Ntcm, Ntsam_level_1, Ntsam_level_2, Tag arrary area efficiency %,
90, 131072, 1, 2, 512, 1.47098, 1.86851, N/A, 0.303592, 0.615022, 63.7023, 2.24949, 2, 2, 1, 2, 2, 1, 78.3192, 2, 2, 4, 1, 8, 1, 77.9289,
90, 131072, 1, 2, 512, 1.47098, 1.86851, N/A, 0.303592, 0.615022, 63.7023, 2.24949, 2, 2, 1, 2, 2, 1, 78.3192, 2, 2, 4, 1, 8, 1, 77.9289,
90, 131072, 1, 2, 512, 1.47098, 1.86851, N/A, 0.303592, 0.615022, 63.7023, 2.24949, 2, 2, 1, 2, 2, 1, 78.3192, 2, 2, 4, 1, 8, 1, 77.9289,
90, 131072, 1, 2, 512, 1.47098, 1.86851, N/A, 0.303592, 0.615022, 63.7023, 2.24949, 2, 2, 1, 2, 2, 1, 78.3192, 2, 2, 4, 1, 8, 1, 77.9289,
90, 131072, 1, 2, 512, 1.47098, 1.86851, N/A, 0.303592, 0.615022, 63.7023, 2.24949, 2, 2, 1, 2, 2, 1, 78.3192, 2, 2, 4, 1, 8, 1, 77.9289,
90, 131072, 1, 2, 512, 1.47098, 1.86851, N/A, 0.303592, 0.615022, 63.7023, 2.24949, 2, 2, 1, 2, 2, 1, 78.3192, 2, 2, 4, 1, 8, 1, 77.9289,
90, 131072, 1, 2, 512, 1.47098, 1.86851, N/A, 0.303592, 0.615022, 63.7023, 2.24949, 2, 2, 1, 2, 2, 1, 78.3192, 2, 2, 4, 1, 8, 1, 77.9289,
90, 131072, 1, 2, 512, 1.47098, 1.86851, N/A, 0.303592, 0.615022, 63.7023, 2.24949, 2, 2, 1, 2, 2, 1, 78.3192, 2, 2, 4, 1, 8, 1, 77.9289,
90, 131072, 1, 2, 512, 1.47098, 1.86851, N/A, 0.303592, 0.615022, 63.7023, 2.24949, 2, 2, 1, 2, 2, 1, 78.3192, 2, 2, 4, 1, 8, 1, 77.9289,
90, 131072, 1, 2, 512, 1.47098, 1.86851, N/A, 0.303592, 0.615022, 63.7023, 2.24949, 2, 2, 1, 2, 2, 1, 78.3192, 2, 2, 4, 1, 8, 1, 77.9289,
90, 131072, 1, 2, 512, 1.47098, 1.86851, N/A, 0.303592, 0.615022, 63.7023, 2.24949, 2, 2, 1, 2, 2, 1, 78.3192, 2, 2, 4, 1, 8, 1, 77.9289,
90, 131072, 1, 2, 512, 1.47098, 1.86851, N/A, 0.303592, 0.615022, 63.7023, 2.24949, 2, 2, 1, 2, 2, 1, 78.3192, 2, 2, 4, 1, 8, 1, 77.9289,
90, 131072, 1, 2, 512, 1.47098, 1.86851, N/A, 0.303592, 0.615022, 63.7023, 2.24949, 2, 2, 1, 2, 2, 1, 78.3192, 2, 2, 4, 1, 8, 1, 77.9289,
90, 131072, 1, 2, 512, 1.47098, 1.86851, N/A, 0.303592, 0.615022, 63.7023, 2.24949, 2, 2, 1, 2, 2, 1, 78.3192, 2, 2, 4, 1, 8, 1, 77.9289,

BIN
T1/TP/TP1/cacti_7/cacti Executable file

Binary file not shown.

View file

@ -0,0 +1,8 @@
/* SWIG interface file: declares a module named "cacti" whose wrappers are
   generated from the declarations in cacti_interface.h. */
%module cacti
%{
/* Includes the header in the wrapper code */
#include "cacti_interface.h"
%}
/* Parse the header file to generate wrappers */
%include "cacti_interface.h"

View file

@ -0,0 +1,53 @@
# Build rules for the `cacti` binary.
# Object files and the linked binary are placed under obj_$(TAG)/, and the
# binary is then copied to ./$(TARGET).
# NOTE(review): TAG is never assigned in this file; it is presumably passed in
# by a wrapper makefile or on the command line (e.g. TAG=dbg) -- confirm.
TARGET = cacti
SHELL = /bin/sh
.PHONY: all depend clean
.SUFFIXES: .cc .o
# NTHREADS can be overridden from the environment/command line; default is 8.
ifndef NTHREADS
NTHREADS = 8
endif
LIBS =
# NOTE(review): INCS carries -lm (a linker flag, not an include path); it is
# only used on the link line below.
INCS = -lm
# TAG=dbg selects an unoptimised single-threaded build (-O0, -DNTHREADS=1);
# any other TAG builds with SSE2 floating point and -DNTHREADS=$(NTHREADS).
ifeq ($(TAG),dbg)
DBG = -Wall
OPT = -ggdb -g -O0 -DNTHREADS=1 -gstabs+
else
DBG =
OPT = -g -msse2 -mfpmath=sse -DNTHREADS=$(NTHREADS)
endif
#CXXFLAGS = -Wall -Wno-unknown-pragmas -Winline $(DBG) $(OPT)
CXXFLAGS = -Wno-unknown-pragmas $(DBG) $(OPT)
CXX = g++ -m64
CC = gcc -m64
SRCS = area.cc bank.cc mat.cc main.cc Ucache.cc io.cc technology.cc basic_circuit.cc parameter.cc \
decoder.cc component.cc uca.cc subarray.cc wire.cc htree2.cc extio.cc extio_technology.cc \
cacti_interface.cc router.cc nuca.cc crossbar.cc arbiter.cc powergating.cc TSV.cc memorybus.cc \
memcad.cc memcad_parameters.cc
# One object per source file, all under obj_$(TAG)/.
OBJS = $(patsubst %.cc,obj_$(TAG)/%.o,$(SRCS))
# NOTE(review): PYTHONLIB_SRCS, PYTHONLIB_OBJS and INCLUDES are defined but not
# used by any rule in this file.
PYTHONLIB_SRCS = $(patsubst main.cc, ,$(SRCS)) obj_$(TAG)/cacti_wrap.cc
PYTHONLIB_OBJS = $(patsubst %.cc,%.o,$(PYTHONLIB_SRCS))
INCLUDES = -I /usr/include/python2.4 -I /usr/lib/python2.4/config
# Default target: link the binary, then copy it next to the sources.
all: obj_$(TAG)/$(TARGET)
cp -f obj_$(TAG)/$(TARGET) $(TARGET)
obj_$(TAG)/$(TARGET) : $(OBJS)
$(CXX) $(OBJS) -o $@ $(INCS) $(CXXFLAGS) $(LIBS) -pthread
#obj_$(TAG)/%.o : %.cc
# $(CXX) -c $(CXXFLAGS) $(INCS) -o $@ $<
# NOTE(review): no rule here creates obj_$(TAG)/; the directory presumably has
# to exist before this makefile is invoked.
obj_$(TAG)/%.o : %.cc
$(CXX) $(CXXFLAGS) -c $< -o $@
# NOTE(review): removes *.o from the current directory only; objects under
# obj_$(TAG)/ are not touched by this rule.
clean:
-rm -f *.o _cacti.so cacti.py $(TARGET)

View file

@ -0,0 +1,174 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#include <time.h>
#include <math.h>
#include "area.h"
#include "basic_circuit.h"
#include "component.h"
#include "const.h"
#include "parameter.h"
#include "cacti_interface.h"
#include "Ucache.h"
#include <pthread.h>
#include <iostream>
#include <algorithm>
using namespace std;
// Strict "less than" ordering over two array organizations.
// Fields are compared in this order: Nspd, Ndwl, Ndbl, deg_bl_muxing,
// Ndsam_lev_1, Ndsam_lev_2. The first field that differs decides the result;
// if every field compares equal the function returns false.
bool mem_array::lt(const mem_array * m1, const mem_array * m2)
{
    if (m1->Nspd < m2->Nspd) return true;
    if (m1->Nspd > m2->Nspd) return false;

    if (m1->Ndwl < m2->Ndwl) return true;
    if (m1->Ndwl > m2->Ndwl) return false;

    if (m1->Ndbl < m2->Ndbl) return true;
    if (m1->Ndbl > m2->Ndbl) return false;

    if (m1->deg_bl_muxing < m2->deg_bl_muxing) return true;
    if (m1->deg_bl_muxing > m2->deg_bl_muxing) return false;

    if (m1->Ndsam_lev_1 < m2->Ndsam_lev_1) return true;
    if (m1->Ndsam_lev_1 > m2->Ndsam_lev_1) return false;

    // Last key: nothing left to fall through to.
    return m1->Ndsam_lev_2 < m2->Ndsam_lev_2;
}
// Derives access_time from the data and tag array results according to the
// configured organization / access mode (read from the global g_ip).
void uca_org_t::find_delay()
{
    // RAM, CAM or fully-associative: only the data array's access time counts.
    if (g_ip->pure_ram || g_ip->pure_cam || g_ip->fully_assoc)
    {
        access_time = data_array2->access_time;
        return;
    }

    // Fast mode: tag and data lookups run in parallel and the entire set is
    // sent over the data array h-tree without waiting for the way-select
    // signal. --TODO add the corresponding power overhead Nav
    if (g_ip->fast_access)
    {
        access_time = MAX(tag_array2->access_time, data_array2->access_time);
        return;
    }

    // Sequential mode: the tag is accessed first; on a hit the way-select
    // signal and the address are sent to read/write the block in the data
    // array, so the two delays add up.
    if (g_ip->is_seq_acc)
    {
        access_time = tag_array2->access_time + data_array2->access_time;
        return;
    }

    // Normal mode: tag and data array accesses start in parallel, but the
    // data array waits for the way-select and transfers only the selected
    // block over the h-tree.
    access_time = MAX(tag_array2->access_time + data_array2->delay_senseamp_mux_decoder,
                      data_array2->delay_before_subarray_output_driver) +
                  data_array2->delay_from_subarray_output_driver_to_output;
}
void uca_org_t::find_energy()
{
if (!(g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc))//(g_ip->is_cache)
power = data_array2->power + tag_array2->power;
else
power = data_array2->power;
}
void uca_org_t::find_area()
{
if (g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)//(g_ip->is_cache == false)
{
cache_ht = data_array2->height;
cache_len = data_array2->width;
}
else
{
cache_ht = MAX(tag_array2->height, data_array2->height);
cache_len = tag_array2->width + data_array2->width;
}
area = cache_ht * cache_len;
}
void uca_org_t::adjust_area()
{
double area_adjust;
if (g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)
{
if (data_array2->area_efficiency/100.0<0.2)
{
//area_adjust = sqrt(area/(area*(data_array2->area_efficiency/100.0)/0.2));
area_adjust = sqrt(0.2/(data_array2->area_efficiency/100.0));
cache_ht = cache_ht/area_adjust;
cache_len = cache_len/area_adjust;
}
}
area = cache_ht * cache_len;
}
void uca_org_t::find_cyc()
{
if ((g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc))//(g_ip->is_cache == false)
{
cycle_time = data_array2->cycle_time;
}
else
{
cycle_time = MAX(tag_array2->cycle_time,
data_array2->cycle_time);
}
}
// Default constructor: both array pointers start out null so that cleanup()
// can tell whether there is anything to release.
uca_org_t::uca_org_t()
    : tag_array2(0),
      data_array2(0)
{
    // nothing else to initialise here
}
// Releases the data and tag arrays owned by this object.
// Both pointers are reset to null afterwards, so calling cleanup() a second
// time on the same object is harmless instead of a double delete.
// `delete` on a null pointer is a no-op, so no explicit null checks are
// needed.
// NOTE(review): copies of this object that share the same pointers are not
// affected by the reset -- only call cleanup() on one of them.
void uca_org_t :: cleanup()
{
    delete data_array2;
    data_array2 = 0;

    delete tag_array2;
    tag_array2 = 0;
}

View file

@ -0,0 +1,904 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#ifndef __CACTI_INTERFACE_H__
#define __CACTI_INTERFACE_H__
#include <map>
#include <string>
#include <vector>
#include <list>
#include <iostream>
#include "const.h"
using namespace std;
class min_values_t;
class mem_array;
class uca_org_t;
// Plain value type holding five power/energy figures: dynamic, leakage,
// gate leakage, short circuit and longer-channel leakage.
// A default-constructed object has all five set to zero; copying and
// assignment are member-wise. operator+ and operator* are declared as
// friends here and defined elsewhere.
class powerComponents
{
  public:
    double dynamic;
    double leakage;
    double gate_leakage;
    double short_circuit;
    double longer_channel_leakage;

    // All components start at zero.
    powerComponents()
    {
        reset();
    }

    // Member-wise copy.
    powerComponents(const powerComponents & obj)
        : dynamic(obj.dynamic),
          leakage(obj.leakage),
          gate_leakage(obj.gate_leakage),
          short_circuit(obj.short_circuit),
          longer_channel_leakage(obj.longer_channel_leakage)
    {
    }

    // Member-wise assignment; returns *this to allow chaining.
    powerComponents & operator=(const powerComponents & rhs)
    {
        dynamic                = rhs.dynamic;
        leakage                = rhs.leakage;
        gate_leakage           = rhs.gate_leakage;
        short_circuit          = rhs.short_circuit;
        longer_channel_leakage = rhs.longer_channel_leakage;
        return *this;
    }

    // Sets every component back to zero.
    void reset()
    {
        dynamic = 0;
        leakage = 0;
        gate_leakage = 0;
        short_circuit = 0;
        longer_channel_leakage = 0;
    }

    friend powerComponents operator+(const powerComponents & x, const powerComponents & y);
    friend powerComponents operator*(const powerComponents & x, double const * const y);
};
// Groups three powerComponents records: one for read operations, one for
// write operations and one for search operations (used for CAM and FA).
// operator+ and operator* are declared as friends here and defined elsewhere.
class powerDef
{
  public:
    powerComponents readOp;
    powerComponents writeOp;
    powerComponents searchOp;//: for CAM and FA

    // Each member is default-constructed.
    powerDef()
        : readOp(),
          writeOp(),
          searchOp()
    {
    }

    // Resets all three records.
    void reset()
    {
        readOp.reset();
        writeOp.reset();
        searchOp.reset();
    }

    friend powerDef operator+(const powerDef & x, const powerDef & y);
    friend powerDef operator*(const powerDef & x, double const * const y);
};
// Wire categories. Invalid_wtype is the last enumerator and marks an
// unrecognised / unset wire type.
enum Wire_type
{
Global /* global wires with repeaters */,
Global_5 /* 5% delay penalty */,
Global_10 /* 10% delay penalty */,
Global_20 /* 20% delay penalty */,
Global_30 /* 30% delay penalty */,
Low_swing /* differential low power wires with high area overhead */,
Semi_global /* mid-level wires with repeaters*/,
Full_swing /* models all global wires with different latencies (Global_x )*/,
Transmission /* transmission lines with high area overhead */,
Optical /* optical wires */,
Invalid_wtype
};
// Through-silicon-via projection.
enum TSV_type
{
Fine, /*ITRS high density*/
Coarse /*Industry reported in 2010*/
};
// ali
// External memory interface type.
// NOTE(review): presumably selected by the "-dram_type" config option, which
// lists "DDR3", "DDR4", "LPDDR2", "WideIO" and "Serial" -- confirm in the
// config parser.
enum Mem_IO_type
{
DDR3,
DDR4,
LPDDR2,
WideIO,
Low_Swing_Diff,
Serial
};
// DIMM flavour.
enum Mem_DIMM
{
UDIMM,
RDIMM,
LRDIMM
};
// Memory interface state.
// NOTE(review): presumably set from the "-io state" config option
// ("READ" / "WRITE" / "IDLE" / "SLEEP") -- confirm in the config parser.
enum Mem_state
{
READ,
WRITE,
IDLE,
SLEEP
};
// DRAM error-correction scheme.
enum Mem_ECC
{
NO_ECC,
SECDED, // single error correction, double error detection
CHIP_KILL
};
// Which DIMM kinds are considered.
// NOTE(review): presumably set from the "-DIMM model" config option -- confirm.
enum DIMM_Model
{
JUST_UDIMM,JUST_RDIMM,JUST_LRDIMM,ALL
};
// Metrics used to order design points.
// NOTE(review): presumably set from the "-first/-second/-third metric" config
// options -- confirm.
enum MemCad_metrics
{
Bandwidth, Energy, Cost
};
/**
enum BoB_LINK
{
PARALLEL, // e.g. Intel SMB c104
SERIAL // e.g. Intel SMB 7510, IBM AMB
};
**/
// end ali
// Container for all user-supplied and derived input parameters of a run.
// All members are public. The member functions are only declared here; their
// definitions are outside this view.
// NOTE(review): the fields are presumably filled in by parse_cfg() from the
// config file -- confirm against its definition.
class InputParameter
{
public:
InputParameter();
// Reads the parameters from the config file named by `infile`.
void parse_cfg(const string & infile);
bool error_checking(); // return false if the input parameters are problematic
void display_ip();
// --- basic array geometry ---
unsigned int cache_sz; // in bytes
unsigned int line_sz;
unsigned int assoc;
unsigned int nbanks;
unsigned int out_w;// == nr_bits_out
bool specific_tag;
unsigned int tag_w;
unsigned int access_mode;
unsigned int obj_func_dyn_energy;
unsigned int obj_func_dyn_power;
unsigned int obj_func_leak_power;
unsigned int obj_func_cycle_t;
double F_sz_nm; // feature size in nm
double F_sz_um; // feature size in um
// --- port counts ---
unsigned int num_rw_ports;
unsigned int num_rd_ports;
unsigned int num_wr_ports;
unsigned int num_se_rd_ports; // number of single ended read ports
unsigned int num_search_ports; // : number of search ports for CAM
// --- memory kind flags ---
bool is_main_mem;
bool is_3d_mem;
bool print_detail_debug;
bool is_cache;
bool pure_ram;
bool pure_cam;
bool rpters_in_htree; // if there are repeaters in htree segment
unsigned int ver_htree_wires_over_array;
unsigned int broadcast_addr_din_over_ver_htrees;
unsigned int temp;
// --- technology type selectors (overall, data array, tag array) ---
unsigned int ram_cell_tech_type;
unsigned int peri_global_tech_type;
unsigned int data_arr_ram_cell_tech_type;
unsigned int data_arr_peri_global_tech_type;
unsigned int tag_arr_ram_cell_tech_type;
unsigned int tag_arr_peri_global_tech_type;
unsigned int burst_len;
unsigned int int_prefetch_w;
unsigned int page_sz_bits;
// --- 3D / TSV related ---
unsigned int num_die_3d;
unsigned int burst_depth;
unsigned int io_width;
unsigned int sys_freq_MHz;
unsigned int tsv_is_subarray_type;
unsigned int tsv_os_bank_type;
unsigned int TSV_proj_type;
int partition_gran;
unsigned int num_tier_row_sprd;
unsigned int num_tier_col_sprd;
bool fine_gran_bank_lvl;
// --- interconnect ---
unsigned int ic_proj_type; // interconnect_projection_type
unsigned int wire_is_mat_type; // wire_inside_mat_type
unsigned int wire_os_mat_type; // wire_outside_mat_type
enum Wire_type wt;
int force_wiretype;
bool print_input_args;
unsigned int nuca_cache_sz; // TODO
// --- forced array partitioning ---
int ndbl, ndwl, nspd, ndsam1, ndsam2, ndcm;
bool force_cache_config;
int cache_level;
int cores;
int nuca_bank_count;
int force_nuca_bank;
// --- optimisation weights and allowed deviations (UCA and NUCA variants) ---
int delay_wt, dynamic_power_wt, leakage_power_wt,
cycle_time_wt, area_wt;
int delay_wt_nuca, dynamic_power_wt_nuca, leakage_power_wt_nuca,
cycle_time_wt_nuca, area_wt_nuca;
int delay_dev, dynamic_power_dev, leakage_power_dev,
cycle_time_dev, area_dev;
int delay_dev_nuca, dynamic_power_dev_nuca, leakage_power_dev_nuca,
cycle_time_dev_nuca, area_dev_nuca;
int ed; //ED or ED2 optimization
int nuca;
// --- access mode / associativity details ---
bool fast_access;
unsigned int block_sz; // bytes
unsigned int tag_assoc;
unsigned int data_assoc;
bool is_seq_acc;
bool fully_assoc;
unsigned int nsets; // == number_of_sets
int print_detail;
bool add_ecc_b_;
//parameters for design constraint
double throughput;
double latency;
bool pipelinable;
int pipeline_stages;
int per_stage_vector;
bool with_clock_grid;
// --- power gating ---
bool array_power_gated;
bool bitline_floating;
bool wl_power_gated;
bool cl_power_gated;
// NOTE(review): "interconect" is misspelled, but the name is part of the
// public interface, so it is left unchanged here.
bool interconect_power_gated;
bool power_gating;
double perfloss;
bool cl_vertical;
// Parameters related to off-chip I/O
double addr_timing, duty_cycle, mem_density, bus_bw, activity_dq, activity_ca, bus_freq;
int mem_data_width, num_mem_dq, num_clk, num_ca, num_dqs, num_dq;
double rtt_value, ron_value, tflight_value; //FIXME
Mem_state iostate;
///char iostate, dram_ecc, io_type;
Mem_ECC dram_ecc;
Mem_IO_type io_type;
Mem_DIMM dram_dimm;
int num_bobs; // BoB is buffer-on-board such as Intel SMB c102
int capacity; // in GB
int num_channels_per_bob; // 1 means no bob
MemCad_metrics first_metric;
MemCad_metrics second_metric;
MemCad_metrics third_metric;
DIMM_Model dimm_model;
bool low_power_permitted; // Not yet implemented. It determines acceptable VDDs.
double load; // between 0 to 1
double row_buffer_hit_rate;
double rd_2_wr_ratio;
bool same_bw_in_bob; // true if all the channels in the bob have the same bandwidth.
bool mirror_in_bob;// true if all the channels in the bob have the same configs
bool total_power; // false means just considering I/O Power
bool verbose;
};
typedef struct{
int Ndwl;
int Ndbl;
double Nspd;
int deg_bl_muxing;
int Ndsam_lev_1;
int Ndsam_lev_2;
int number_activated_mats_horizontal_direction;
int number_subbanks;
int page_size_in_bits;
double delay_route_to_bank;
double delay_crossbar;
double delay_addr_din_horizontal_htree;
double delay_addr_din_vertical_htree;
double delay_row_predecode_driver_and_block;
double delay_row_decoder;
double delay_bitlines;
double delay_sense_amp;
double delay_subarray_output_driver;
double delay_bit_mux_predecode_driver_and_block;
double delay_bit_mux_decoder;
double delay_senseamp_mux_lev_1_predecode_driver_and_block;
double delay_senseamp_mux_lev_1_decoder;
double delay_senseamp_mux_lev_2_predecode_driver_and_block;
double delay_senseamp_mux_lev_2_decoder;
double delay_input_htree;
double delay_output_htree;
double delay_dout_vertical_htree;
double delay_dout_horizontal_htree;
double delay_comparator;
double access_time;
double cycle_time;
double multisubbank_interleave_cycle_time;
double delay_request_network;
double delay_inside_mat;
double delay_reply_network;
double trcd;
double cas_latency;
double precharge_delay;
powerDef power_routing_to_bank;
powerDef power_addr_input_htree;
powerDef power_data_input_htree;
powerDef power_data_output_htree;
powerDef power_addr_horizontal_htree;
powerDef power_datain_horizontal_htree;
powerDef power_dataout_horizontal_htree;
powerDef power_addr_vertical_htree;
powerDef power_datain_vertical_htree;
powerDef power_row_predecoder_drivers;
powerDef power_row_predecoder_blocks;
powerDef power_row_decoders;
powerDef power_bit_mux_predecoder_drivers;
powerDef power_bit_mux_predecoder_blocks;
powerDef power_bit_mux_decoders;
powerDef power_senseamp_mux_lev_1_predecoder_drivers;
powerDef power_senseamp_mux_lev_1_predecoder_blocks;
powerDef power_senseamp_mux_lev_1_decoders;
powerDef power_senseamp_mux_lev_2_predecoder_drivers;
powerDef power_senseamp_mux_lev_2_predecoder_blocks;
powerDef power_senseamp_mux_lev_2_decoders;
powerDef power_bitlines;
powerDef power_sense_amps;
powerDef power_prechg_eq_drivers;
powerDef power_output_drivers_at_subarray;
powerDef power_dataout_vertical_htree;
powerDef power_comparators;
powerDef power_crossbar;
powerDef total_power;
double area;
double all_banks_height;
double all_banks_width;
double bank_height;
double bank_width;
double subarray_memory_cell_area_height;
double subarray_memory_cell_area_width;
double mat_height;
double mat_width;
double routing_area_height_within_bank;
double routing_area_width_within_bank;
double area_efficiency;
// double perc_power_dyn_routing_to_bank;
// double perc_power_dyn_addr_horizontal_htree;
// double perc_power_dyn_datain_horizontal_htree;
// double perc_power_dyn_dataout_horizontal_htree;
// double perc_power_dyn_addr_vertical_htree;
// double perc_power_dyn_datain_vertical_htree;
// double perc_power_dyn_row_predecoder_drivers;
// double perc_power_dyn_row_predecoder_blocks;
// double perc_power_dyn_row_decoders;
// double perc_power_dyn_bit_mux_predecoder_drivers;
// double perc_power_dyn_bit_mux_predecoder_blocks;
// double perc_power_dyn_bit_mux_decoders;
// double perc_power_dyn_senseamp_mux_lev_1_predecoder_drivers;
// double perc_power_dyn_senseamp_mux_lev_1_predecoder_blocks;
// double perc_power_dyn_senseamp_mux_lev_1_decoders;
// double perc_power_dyn_senseamp_mux_lev_2_predecoder_drivers;
// double perc_power_dyn_senseamp_mux_lev_2_predecoder_blocks;
// double perc_power_dyn_senseamp_mux_lev_2_decoders;
// double perc_power_dyn_bitlines;
// double perc_power_dyn_sense_amps;
// double perc_power_dyn_prechg_eq_drivers;
// double perc_power_dyn_subarray_output_drivers;
// double perc_power_dyn_dataout_vertical_htree;
// double perc_power_dyn_comparators;
// double perc_power_dyn_crossbar;
// double perc_power_dyn_spent_outside_mats;
// double perc_power_leak_routing_to_bank;
// double perc_power_leak_addr_horizontal_htree;
// double perc_power_leak_datain_horizontal_htree;
// double perc_power_leak_dataout_horizontal_htree;
// double perc_power_leak_addr_vertical_htree;
// double perc_power_leak_datain_vertical_htree;
// double perc_power_leak_row_predecoder_drivers;
// double perc_power_leak_row_predecoder_blocks;
// double perc_power_leak_row_decoders;
// double perc_power_leak_bit_mux_predecoder_drivers;
// double perc_power_leak_bit_mux_predecoder_blocks;
// double perc_power_leak_bit_mux_decoders;
// double perc_power_leak_senseamp_mux_lev_1_predecoder_drivers;
// double perc_power_leak_senseamp_mux_lev_1_predecoder_blocks;
// double perc_power_leak_senseamp_mux_lev_1_decoders;
// double perc_power_leak_senseamp_mux_lev_2_predecoder_drivers;
// double perc_power_leak_senseamp_mux_lev_2_predecoder_blocks;
// double perc_power_leak_senseamp_mux_lev_2_decoders;
// double perc_power_leak_bitlines;
// double perc_power_leak_sense_amps;
// double perc_power_leak_prechg_eq_drivers;
// double perc_power_leak_subarray_output_drivers;
// double perc_power_leak_dataout_vertical_htree;
// double perc_power_leak_comparators;
// double perc_power_leak_crossbar;
// double perc_leak_mats;
// double perc_active_mats;
double refresh_power;
double dram_refresh_period;
double dram_array_availability;
double dyn_read_energy_from_closed_page;
double dyn_read_energy_from_open_page;
double leak_power_subbank_closed_page;
double leak_power_subbank_open_page;
double leak_power_request_and_reply_networks;
double activate_energy;
double read_energy;
double write_energy;
double precharge_energy;
} results_mem_array;
// Result record for one cache/memory organization. Every cacti_interface()
// overload and init_interface() declared in this header returns one by value.
// NOTE(review): units of the numeric fields are not stated in this header --
// confirm against the code that fills them in.
class uca_org_t
{
public:
// Detailed per-array objects (mem_array is declared later in this header).
// NOTE(review): raw pointers with an empty destructor below; presumably
// cleanup() is what releases them -- confirm before copying or discarding
// a uca_org_t.
mem_array * tag_array2;
mem_array * data_array2;
// Aggregate figures for the whole organization.
double access_time;
double cycle_time;
double area;
double area_efficiency;
powerDef power;
double leak_power_with_sleep_transistors_in_mats;
double cache_ht;
double cache_len;
// Fixed 100-byte buffer; presumably a file name -- TODO confirm which one.
char file_n[100];
double vdd_periph_global;
bool valid;
// Flattened per-array result structs for the tag and data arrays.
results_mem_array tag_array;
results_mem_array data_array;
uca_org_t();
// The find_*() members are defined outside this header; by their names they
// derive the aggregate delay/energy/area/cycle-time fields above.
void find_delay();
void find_energy();
void find_area();
void find_cyc();
void adjust_area();//for McPAT only to adjust routing overhead
void cleanup();
// Intentionally empty: destruction releases nothing.
~uca_org_t(){};
};
// Plain record of off-chip I/O figures. A default-constructed object has
// every field set to zero.
class IO_org_t
{
  public:
    double io_area;
    double io_timing_margin;
    double io_voltage_margin;
    double io_dynamic_power;
    double io_phy_power;
    double io_wakeup_time;
    double io_termination_power;

    // Zero-initialize all fields (listed in declaration order).
    IO_org_t()
      : io_area(0.0),
        io_timing_margin(0.0),
        io_voltage_margin(0.0),
        io_dynamic_power(0.0),
        io_phy_power(0.0),
        io_wakeup_time(0.0),
        io_termination_power(0.0)
    {
    }
};
// Free-function entry points; all are defined outside this header.
// reconfigure() takes an existing result object by pointer (fin_res) together
// with the input parameters; what it updates is defined in the implementation.
void reconfigure(InputParameter *local_interface, uca_org_t *fin_res);
// Overload driven by a file name (presumably the path of a configuration
// file such as the .cfg files shipped with the tool -- confirm).
uca_org_t cacti_interface(const string & infile_name);
//McPAT's plain interface, please keep !!!
uca_org_t cacti_interface(InputParameter * const local_interface);
//McPAT's plain interface, please keep !!!
uca_org_t init_interface(InputParameter * const local_interface);
//McPAT's plain interface, please keep !!!
// Flat-argument overload: every setting is passed positionally as an int
// (tech_node is the only double). The first 52 parameters match, name for
// name and in the same order, the CACTI3DD overload declared further down.
// Caution for callers: here obj_func_cycle_time comes BEFORE obj_func_area,
// whereas "Naveen's interface" below lists obj_func_area first; all arguments
// are plain ints, so a swapped pair compiles silently.
uca_org_t cacti_interface(
int cache_size,
int line_size,
int associativity,
int rw_ports,
int excl_read_ports,
int excl_write_ports,
int single_ended_read_ports,
int search_ports,
int banks,
double tech_node,
int output_width,
int specific_tag,
int tag_width,
int access_mode,
int cache,
int main_mem,
int obj_func_delay,
int obj_func_dynamic_power,
int obj_func_leakage_power,
int obj_func_cycle_time,
int obj_func_area,
int dev_func_delay,
int dev_func_dynamic_power,
int dev_func_leakage_power,
int dev_func_area,
int dev_func_cycle_time,
int ed_ed2_none, // 0 - ED, 1 - ED^2, 2 - use weight and deviate
int temp,
int wt, //0 - default(search across everything), 1 - global, 2 - 5% delay penalty, 3 - 10%, 4 - 20 %, 5 - 30%, 6 - low-swing
int data_arr_ram_cell_tech_flavor_in,
int data_arr_peri_global_tech_flavor_in,
int tag_arr_ram_cell_tech_flavor_in,
int tag_arr_peri_global_tech_flavor_in,
int interconnect_projection_type_in,
int wire_inside_mat_type_in,
int wire_outside_mat_type_in,
int REPEATERS_IN_HTREE_SEGMENTS_in,
int VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in,
int BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in,
int PAGE_SIZE_BITS_in,
int BURST_LENGTH_in,
int INTERNAL_PREFETCH_WIDTH_in,
int force_wiretype,
int wiretype,
int force_config,
// Explicit array-organization values; presumably honoured only when
// force_config is set -- confirm in the implementation.
int ndwl,
int ndbl,
int nspd,
int ndcm,
int ndsam1,
int ndsam2,
int ecc);
// int cache_size,
// int line_size,
// int associativity,
// int rw_ports,
// int excl_read_ports,
// int excl_write_ports,
// int single_ended_read_ports,
// int banks,
// double tech_node,
// int output_width,
// int specific_tag,
// int tag_width,
// int access_mode,
// int cache,
// int main_mem,
// int obj_func_delay,
// int obj_func_dynamic_power,
// int obj_func_leakage_power,
// int obj_func_area,
// int obj_func_cycle_time,
// int dev_func_delay,
// int dev_func_dynamic_power,
// int dev_func_leakage_power,
// int dev_func_area,
// int dev_func_cycle_time,
// int temp,
// int data_arr_ram_cell_tech_flavor_in,
// int data_arr_peri_global_tech_flavor_in,
// int tag_arr_ram_cell_tech_flavor_in,
// int tag_arr_peri_global_tech_flavor_in,
// int interconnect_projection_type_in,
// int wire_inside_mat_type_in,
// int wire_outside_mat_type_in,
// int REPEATERS_IN_HTREE_SEGMENTS_in,
// int VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in,
// int BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in,
//// double MAXAREACONSTRAINT_PERC_in,
//// double MAXACCTIMECONSTRAINT_PERC_in,
//// double MAX_PERC_DIFF_IN_DELAY_FROM_BEST_DELAY_REPEATER_SOLUTION_in,
// int PAGE_SIZE_BITS_in,
// int BURST_LENGTH_in,
// int INTERNAL_PREFETCH_WIDTH_in);
//Naveen's interface
// Flat-argument overload that additionally carries the NUCA settings
// (is_nuca, core_count, cache_level, nuca_* weights). Unlike the McPAT and
// CACTI3DD overloads it has no search_ports parameter, it takes
// page_sz/burst_length/pre_width right after tech_node, and it lists
// obj_func_area BEFORE obj_func_cycle_time.
uca_org_t cacti_interface(
int cache_size,
int line_size,
int associativity,
int rw_ports,
int excl_read_ports,
int excl_write_ports,
int single_ended_read_ports,
int banks,
double tech_node,
int page_sz,
int burst_length,
int pre_width,
int output_width,
int specific_tag,
int tag_width,
int access_mode, //0 normal, 1 seq, 2 fast
int cache, //scratch ram or cache
int main_mem,
int obj_func_delay,
int obj_func_dynamic_power,
int obj_func_leakage_power,
int obj_func_area,
int obj_func_cycle_time,
int dev_func_delay,
int dev_func_dynamic_power,
int dev_func_leakage_power,
int dev_func_area,
int dev_func_cycle_time,
int ed_ed2_none, // 0 - ED, 1 - ED^2, 2 - use weight and deviate
int temp,
int wt, //0 - default(search across everything), 1 - global, 2 - 5% delay penalty, 3 - 10%, 4 - 20 %, 5 - 30%, 6 - low-swing
int data_arr_ram_cell_tech_flavor_in,
int data_arr_peri_global_tech_flavor_in,
int tag_arr_ram_cell_tech_flavor_in,
int tag_arr_peri_global_tech_flavor_in,
int interconnect_projection_type_in, // 0 - aggressive, 1 - normal
int wire_inside_mat_type_in,
int wire_outside_mat_type_in,
int is_nuca, // 0 - UCA, 1 - NUCA
int core_count,
int cache_level, // 0 - L2, 1 - L3
int nuca_bank_count,
int nuca_obj_func_delay,
int nuca_obj_func_dynamic_power,
int nuca_obj_func_leakage_power,
int nuca_obj_func_area,
int nuca_obj_func_cycle_time,
int nuca_dev_func_delay,
int nuca_dev_func_dynamic_power,
int nuca_dev_func_leakage_power,
int nuca_dev_func_area,
int nuca_dev_func_cycle_time,
int REPEATERS_IN_HTREE_SEGMENTS_in,//TODO for now only wires with repeaters are supported
int p_input);
//CACTI3DD interface
// Flat-argument overload for 3D DRAM. Parameters 1-52 (cache_size .. ecc)
// match the McPAT flat overload above name for name; the remaining eleven
// (is_3d_dram .. partition_level) carry the 3D-specific settings. The
// "paraN" comments give the 1-based position of the parameter on that line.
uca_org_t cacti_interface(
int cache_size,
int line_size,
int associativity,
int rw_ports,
int excl_read_ports,// para5
int excl_write_ports,
int single_ended_read_ports,
int search_ports,
int banks,
double tech_node,//para10
int output_width,
int specific_tag,
int tag_width,
int access_mode,
int cache, //para15
int main_mem,
int obj_func_delay,
int obj_func_dynamic_power,
int obj_func_leakage_power,
int obj_func_cycle_time, //para20
int obj_func_area,
int dev_func_delay,
int dev_func_dynamic_power,
int dev_func_leakage_power,
int dev_func_area, //para25
int dev_func_cycle_time,
int ed_ed2_none, // 0 - ED, 1 - ED^2, 2 - use weight and deviate
int temp,
int wt, //0 - default(search across everything), 1 - global, 2 - 5% delay penalty, 3 - 10%, 4 - 20 %, 5 - 30%, 6 - low-swing
int data_arr_ram_cell_tech_flavor_in,//para30
int data_arr_peri_global_tech_flavor_in,
int tag_arr_ram_cell_tech_flavor_in,
int tag_arr_peri_global_tech_flavor_in,
int interconnect_projection_type_in,
int wire_inside_mat_type_in,//para35
int wire_outside_mat_type_in,
int REPEATERS_IN_HTREE_SEGMENTS_in,
int VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in,
int BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in,
int PAGE_SIZE_BITS_in,//para40
int BURST_LENGTH_in,
int INTERNAL_PREFETCH_WIDTH_in,
int force_wiretype,
int wiretype,
int force_config,//para45
int ndwl,
int ndbl,
int nspd,
int ndcm,
int ndsam1,//para50
int ndsam2,
int ecc,
// 3D-DRAM specific settings start here.
int is_3d_dram,
int burst_depth,
int IO_width,
int sys_freq,
int debug_detail,
int num_dies,
int tsv_gran_is_subarray,
// NOTE(review): "os" looks like a typo for "is" (compare the parameter
// above); harmless in a prototype, but check the definition uses the same name.
int tsv_gran_os_bank,
int num_tier_row_sprd,
int num_tier_col_sprd,
int partition_level);
// Detailed result record for a single memory array (uca_org_t points to one
// for the tag array and one for the data array). All members are public data;
// the only function is the static comparison predicate lt().
// NOTE(review): units are not stated in this header -- confirm against the
// code that fills these fields in.
class mem_array
{
public:
// Array organization; the names mirror the ndcm/ndwl/ndbl/nspd/ndsam1/ndsam2
// arguments of the flat cacti_interface() overloads.
int Ndcm;
int Ndwl;
int Ndbl;
double Nspd;
int deg_bl_muxing;
int Ndsam_lev_1;
int Ndsam_lev_2;
// Top-level timing, area and power.
double access_time;
double cycle_time;
double multisubbank_interleave_cycle_time;
double area_ram_cells;
double area;
powerDef power;
double delay_senseamp_mux_decoder;
double delay_before_subarray_output_driver;
double delay_from_subarray_output_driver_to_output;
// Physical dimensions.
double height;
double width;
double mat_height;
double mat_length;
double subarray_length;
double subarray_height;
// Per-stage delay breakdown.
double delay_route_to_bank,
delay_input_htree,
delay_row_predecode_driver_and_block,
delay_row_decoder,
delay_bitlines,
delay_sense_amp,
delay_subarray_output_driver,
delay_dout_htree,
delay_comparator,
delay_matchlines;
//CACTI3DD 3d stats
double delay_row_activate_net,
delay_local_wordline,
delay_column_access_net,
delay_column_predecoder,
delay_column_decoder,
delay_column_selectline,
delay_datapath_net,
delay_global_data,
delay_local_data_and_drv,
delay_data_buffer;
// Per-stage energy breakdown.
double energy_row_activate_net,
energy_row_predecode_driver_and_block,
energy_row_decoder,
energy_local_wordline,
energy_bitlines,
energy_sense_amp,
energy_column_access_net,
energy_column_predecoder,
energy_column_decoder,
energy_column_selectline,
energy_datapath_net,
energy_global_data,
energy_local_data_and_drv,
energy_data_buffer,
energy_subarray_output_driver;
double all_banks_height,
all_banks_width,
area_efficiency;
// Per-component power breakdown.
powerDef power_routing_to_bank;
powerDef power_addr_input_htree;
powerDef power_data_input_htree;
powerDef power_data_output_htree;
powerDef power_htree_in_search;
powerDef power_htree_out_search;
powerDef power_row_predecoder_drivers;
powerDef power_row_predecoder_blocks;
powerDef power_row_decoders;
powerDef power_bit_mux_predecoder_drivers;
powerDef power_bit_mux_predecoder_blocks;
powerDef power_bit_mux_decoders;
powerDef power_senseamp_mux_lev_1_predecoder_drivers;
powerDef power_senseamp_mux_lev_1_predecoder_blocks;
powerDef power_senseamp_mux_lev_1_decoders;
powerDef power_senseamp_mux_lev_2_predecoder_drivers;
powerDef power_senseamp_mux_lev_2_predecoder_blocks;
powerDef power_senseamp_mux_lev_2_decoders;
powerDef power_bitlines;
powerDef power_sense_amps;
powerDef power_prechg_eq_drivers;
powerDef power_output_drivers_at_subarray;
powerDef power_dataout_vertical_htree;
powerDef power_comparators;
// Search-path (CAM bitline / searchline / matchline) components.
powerDef power_cam_bitline_precharge_eq_drv;
powerDef power_searchline;
powerDef power_searchline_precharge;
powerDef power_matchlines;
powerDef power_matchline_precharge;
powerDef power_matchline_to_wordline_drv;
// NOTE(review): raw pointer; ownership is not visible from this header.
min_values_t *arr_min;
enum Wire_type wt;
// dram stats
double activate_energy, read_energy, write_energy, precharge_energy,
refresh_power, leak_power_subbank_closed_page, leak_power_subbank_open_page,
leak_power_request_and_reply_networks;
double precharge_delay;
//Power-gating stats
double array_leakage;
double wl_leakage;
double cl_leakage;
double sram_sleep_tx_width, wl_sleep_tx_width, cl_sleep_tx_width;
double sram_sleep_tx_area, wl_sleep_tx_area, cl_sleep_tx_area;
double sram_sleep_wakeup_latency, wl_sleep_wakeup_latency, cl_sleep_wakeup_latency, bl_floating_wakeup_latency;
double sram_sleep_wakeup_energy, wl_sleep_wakeup_energy, cl_sleep_wakeup_energy, bl_floating_wakeup_energy;
int num_active_mats;
// NOTE(review): "submarray" is presumably a misspelling of "subarray"; it is
// part of the public member name, so it is left unchanged here.
int num_submarray_mats;
// Comparison predicate over two arrays; the ordering criterion is defined in
// the implementation file, not here.
static bool lt(const mem_array * m1, const mem_array * m2);
//CACTI3DD 3d dram stats
double t_RCD, t_RAS, t_RC, t_CAS, t_RP, t_RRD;
double activate_power, read_power, write_power, peak_read_power;
int num_row_subarray, num_col_subarray;
double delay_TSV_tot, area_TSV_tot, dyn_pow_TSV_tot, dyn_pow_TSV_per_access;
unsigned int num_TSV_tot;
double area_lwl_drv, area_row_predec_dec, area_col_predec_dec,
area_subarray, area_bus, area_address_bus, area_data_bus, area_data_drv, area_IOSA, area_sense_amp;
};
#endif

View file

@ -0,0 +1,237 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#include <assert.h>
#include <iostream>
#include <math.h>
#include "bank.h"
#include "component.h"
#include "decoder.h"
using namespace std;
// Default-constructs a component with value-initialized area and power
// records and zeroed timing. cycle_time is a plain double member of the
// class; it is listed here explicitly because omitting it from the
// initializer list leaves it with an indeterminate value.
Component::Component()
  : area(), power(), rt_power(), delay(0), cycle_time(0)
{
}

// Nothing to release: all members are held by value.
Component::~Component()
{
}
// Returns the total diffusion width of a transistor stack.
//   num_stacked_in : number of series (stacked) poly gates per finger
//   num_folded_tr  : number of fingers the device is folded into
// Dimensions come from the global technology (g_tp) and input (g_ip) records.
double Component::compute_diffusion_width(int num_stacked_in, int num_folded_tr)
{
  const double poly_width = g_ip->F_sz_um;
  // Room needed for a contacted diffusion region next to a poly line:
  // the contact itself plus poly-to-contact spacing on both sides.
  const double contacted_spacing = g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact;

  // Single finger: source and drain contacts, the stacked gates and the
  // uncontacted gaps between them.
  double diff_width = 2 * contacted_spacing +  // for both source and drain
                      num_stacked_in * poly_width +
                      (num_stacked_in - 1) * g_tp.spacing_poly_to_poly;

  if (num_folded_tr <= 1)
  {
    return diff_width;
  }

  // Each additional finger adds its own gates and inter-gate gaps; the
  // contacted-spacing term is applied with a factor of (fingers - 2) * 2.
  const int extra_fingers = num_folded_tr - 1;
  diff_width += (num_folded_tr - 2) * 2 * contacted_spacing +
                extra_fingers * num_stacked_in * poly_width +
                extra_fingers * (num_stacked_in - 1) * g_tp.spacing_poly_to_poly;
  return diff_width;
}
// Returns the layout area of one INV / NOR / NAND gate.
//   gate_type  : INV, NOR or NAND; any other value prints a message and exits
//   num_inputs : fan-in of the gate (not used for INV)
//   w_pmos     : total PMOS width
//   w_nmos     : total NMOS width
//   h_gate     : height available for the gate, power rails included
// Returns 0.0 when either width is non-positive or the P share of the total
// width is not strictly between 0 and 1.
double Component::compute_gate_area(
    int    gate_type,
    int    num_inputs,
    double w_pmos,
    double w_nmos,
    double h_gate)
{
  if (w_nmos <= 0.0 || w_pmos <= 0.0)
  {
    return 0.0;
  }

  Area gate;
  const double p_share = w_pmos / (w_pmos + w_nmos);
  if (p_share <= 0 || p_share >= 1)
  {
    return 0.0;
  }

  // Height left for the two diffusions once the power rails and the
  // mandatory P/N gap are removed, split in proportion to the widths.
  const double h_tr_region     = h_gate - 2 * g_tp.HPOWERRAIL;
  const double h_for_diffs     = h_tr_region - g_tp.MIN_GAP_BET_P_AND_N_DIFFS;
  const double max_pmos_finger = h_for_diffs * p_share;
  const double max_nmos_finger = h_for_diffs * (1 - p_share);
  assert(max_pmos_finger > 0);

  // Number of fingers needed so that each one fits in its share of the height.
  const int pmos_fingers = (int) (ceil(w_pmos / max_pmos_finger));
  const int nmos_fingers = (int) (ceil(w_nmos / max_nmos_finger));

  double ndiff_width = 0.0;
  double pdiff_width = 0.0;
  if (gate_type == INV)
  {
    ndiff_width = compute_diffusion_width(1, nmos_fingers);
    pdiff_width = compute_diffusion_width(1, pmos_fingers);
  }
  else if (gate_type == NOR)
  {
    // parallel NMOS devices, series (stacked) PMOS devices
    ndiff_width = compute_diffusion_width(1, num_inputs * nmos_fingers);
    pdiff_width = compute_diffusion_width(num_inputs, pmos_fingers);
  }
  else if (gate_type == NAND)
  {
    // series (stacked) NMOS devices, parallel PMOS devices
    ndiff_width = compute_diffusion_width(num_inputs, nmos_fingers);
    pdiff_width = compute_diffusion_width(1, num_inputs * pmos_fingers);
  }
  else
  {
    cout << "Unknown gate type: " << gate_type << endl;
    exit(1);
  }

  gate.w = MAX(ndiff_width, pdiff_width);
  // When the NMOS fits in a single finger with room to spare, the gate can be
  // shorter than the height offered, so use the height it actually needs.
  gate.h = (max_nmos_finger > w_nmos)
             ? w_nmos + w_pmos + g_tp.MIN_GAP_BET_P_AND_N_DIFFS + 2 * g_tp.HPOWERRAIL
             : h_gate;
  return gate.get_area();
}
// Returns the width occupied by a transistor of width input_width once it is
// folded into fingers no wider than threshold_folding_width.
// Note: the result is the width of the cell, not the width of a device; the
// two are orthogonal. Returns 0 for a non-positive input_width.
double Component::compute_tr_width_after_folding(
    double input_width,
    double threshold_folding_width)
{
  if (input_width <= 0)
  {
    return 0;
  }

  const int    fingers           = (int) (ceil(input_width / threshold_folding_width));
  const double contacted_spacing = g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact;
  const double poly_width        = g_ip->F_sz_um;

  // one poly line per finger, and a contacted region on either side of each
  return fingers * poly_width + (fingers + 1) * contacted_spacing;
}
// Returns the height of a sense amplifier laid out within the given pitch:
// the folded PMOS devices (two sense transistors plus the isolation device),
// the folded NMOS devices (two sense transistors plus the enable device),
// and the gap between the P and N diffusion areas.
double Component::height_sense_amplifier(double pitch_sense_amp)
{
  const double same_type_gaps = 2 * g_tp.MIN_GAP_BET_SAME_TYPE_DIFFS;

  // height occupied by all PMOS transistors
  const double pmos_height =
      compute_tr_width_after_folding(g_tp.w_sense_p, pitch_sense_amp) * 2 +
      compute_tr_width_after_folding(g_tp.w_iso, pitch_sense_amp) +
      same_type_gaps;

  // height occupied by all NMOS transistors
  const double nmos_height =
      compute_tr_width_after_folding(g_tp.w_sense_n, pitch_sense_amp) * 2 +
      compute_tr_width_after_folding(g_tp.w_sense_en, pitch_sense_amp) +
      same_type_gaps;

  // total height, including the gap between the p and n diffusion areas
  return pmos_height + nmos_height + g_tp.MIN_GAP_BET_P_AND_N_DIFFS;
}
// Sizes a chain of gates with the method of logical effort.
//   num_gates_min   : minimum number of stages to use
//   g               : logical effort of the first stage; only used when the
//                     last stage has to be clamped to max_w_nmos
//   F               : total path effort
//   w_n, w_p        : in/out arrays of NMOS/PMOS widths. Entry 0 is an input
//                     (read, never written here); entries 1..N-1 are filled in.
//                     They must hold at least MAX_NUMBER_GATES_STAGE elements.
//   C_load          : load capacitance driven by the last stage
//   p_to_n_sz_ratio : PMOS-to-NMOS width ratio applied to every stage
//   is_dram_, is_wl_tr_ : forwarded unchanged to gate_C()
//   max_w_nmos      : upper limit on the NMOS width of the last stage
// Returns the number of stages N.
//
// The stage count is validated BEFORE it is used as an array index; checking
// it only at the end would let an out-of-range count write outside w_n/w_p
// before the assertion could fire.
int Component::logical_effort(
    int    num_gates_min,
    double g,
    double F,
    double * w_n,
    double * w_p,
    double C_load,
    double p_to_n_sz_ratio,
    bool   is_dram_,
    bool   is_wl_tr_,
    double max_w_nmos)
{
  int num_gates = (int) (log(F) / log(fopt));

  // check if num_gates is odd. if so, add 1 to make it even
  num_gates += (num_gates % 2) ? 1 : 0;
  num_gates = MAX(num_gates, num_gates_min);
  assert(num_gates >= 1 && num_gates <= MAX_NUMBER_GATES_STAGE);

  // recalculate the effective fanout of each stage
  double f = pow(F, 1.0 / num_gates);
  int    i = num_gates - 1;
  double C_in = C_load / f;
  w_n[i] = (1.0 / (1.0 + p_to_n_sz_ratio)) * C_in / gate_C(1, 0, is_dram_, false, is_wl_tr_);
  w_n[i] = MAX(w_n[i], g_tp.min_w_nmos_);
  w_p[i] = p_to_n_sz_ratio * w_n[i];

  if (w_n[i] > max_w_nmos) // && !g_ip->is_3d_mem)
  {
    // The last stage would exceed the width limit: clamp it and redo the
    // sizing for the effort between the first stage and the clamped one.
    double C_ld = gate_C((1 + p_to_n_sz_ratio) * max_w_nmos, 0, is_dram_, false, is_wl_tr_);
    F = g * C_ld / gate_C(w_n[0] + w_p[0], 0, is_dram_, false, is_wl_tr_);
    num_gates = (int) (log(F) / log(fopt)) + 1;
    num_gates += (num_gates % 2) ? 1 : 0;
    num_gates = MAX(num_gates, num_gates_min);
    assert(num_gates >= 1 && num_gates <= MAX_NUMBER_GATES_STAGE);
    f = pow(F, 1.0 / (num_gates - 1));
    i = num_gates - 1;
    w_n[i] = max_w_nmos;
    w_p[i] = p_to_n_sz_ratio * w_n[i];
  }

  // Walk back from the last stage, dividing by the stage effort each time
  // and never going below the minimum NMOS width.
  for (i = num_gates - 2; i >= 1; i--)
  {
    w_n[i] = MAX(w_n[i+1] / f, g_tp.min_w_nmos_);
    w_p[i] = p_to_n_sz_ratio * w_n[i];
  }

  return num_gates;
}

View file

@ -0,0 +1,84 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#ifndef __COMPONENT_H__
#define __COMPONENT_H__
#include "parameter.h"
#include "area.h"
using namespace std;
class Crossbar;
class Bank;
// Holds the area, power and timing results of one circuit block and provides
// shared layout/sizing helpers. logical_effort() is protected, so the class
// is evidently meant to be derived from.
class Component
{
public:
Component();
// NOTE(review): the destructor is not virtual. That is only safe if derived
// objects are never deleted through a Component pointer -- confirm.
~Component();
Area area;
powerDef power,rt_power;
double delay;
double cycle_time;
// Area of an INV/NOR/NAND gate with the given total device widths that has
// to fit in height h_gate.
double compute_gate_area(
int gate_type,
int num_inputs,
double w_pmos,
double w_nmos,
double h_gate);
// Cell width taken by a transistor of input_width after folding it into
// fingers no wider than threshold_folding_width.
double compute_tr_width_after_folding(double input_width, double threshold_folding_width);
// Height of a sense amplifier laid out within pitch_sense_amp.
double height_sense_amplifier(double pitch_sense_amp);
protected:
// Sizes a gate chain: fills w_n/w_p (entry 0 is supplied by the caller) and
// returns the number of stages.
int logical_effort(
int num_gates_min,
double g,
double F,
double * w_n,
double * w_p,
double C_load,
double p_to_n_sz_ratio,
bool is_dram_,
bool is_wl_tr_,
double max_w_nmos);
private:
// Total diffusion width of num_folded_tr fingers of num_stacked_in series gates.
double compute_diffusion_width(int num_stacked_in, int num_folded_tr);
};
#endif

273
T1/TP/TP1/cacti_7/const.h Normal file
View file

@ -0,0 +1,273 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#ifndef __CONST_H__
#define __CONST_H__
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <math.h>
/* The following are things you might want to change
* when compiling
*/
/*
* Address bits in a word, and number of output bits from the cache
*/
/*
was: #define ADDRESS_BITS 32
now: 42 bits as in the Power4
This is 36 bits in Pentium 4
and 40 bits in Opteron.
*/
const int ADDRESS_BITS = 42;
/*dt: In addition to the tag bits, the tags also include 1 valid bit, 1 dirty bit, 2 bits for a 4-state
cache coherency protocol (MESI), 1 bit for MRU (change this to log(ways) for full LRU).
So in total we have 1 + 1 + 2 + 1 = 5 */
const int EXTRA_TAG_BITS = 5;
/* limits on the various N parameters */
const unsigned int MAXDATAN = 512; // maximum for Ndwl and Ndbl
const unsigned int MAXSUBARRAYS = 1048576; // maximum subarrays for data and tag arrays (2^20)
const unsigned int MAXDATASPD = 256; // maximum for Nspd
const unsigned int MAX_COL_MUX = 256; // NOTE(review): presumably the maximum column-mux degree -- confirm
#define ROUTER_TYPES 3
// NOTE(review): WIRE_TYPES is 6 here while NUMBER_WIRE_TYPES further down is 4;
// confirm which count each user of these macros expects.
#define WIRE_TYPES 6
const double Cpolywire = 0;
/* Threshold voltages (as a proportion of Vdd)
If you don't know them, set all values to 0.5 */
#define VTHFA1 0.452
#define VTHFA2 0.304
#define VTHFA3 0.420
#define VTHFA4 0.413
#define VTHFA5 0.405
#define VTHFA6 0.452
#define VSINV 0.452
#define VTHCOMPINV 0.437
#define VTHMUXNAND 0.548 // TODO : this constant must be revisited
#define VTHEVALINV 0.452
#define VTHSENSEEXTDRV 0.438
//WmuxdrvNANDn and WmuxdrvNANDp are no longer used for sizing, but the old
//delay_comparator function, which is kept exactly as it used to be, still
//refers to them, so they are simply set to 0.
const double WmuxdrvNANDn = 0;
const double WmuxdrvNANDp = 0;
/*===================================================================*/
/*
* The following are things you probably wouldn't want to change.
*/
// Large sentinel values.
#define BIGNUM 1e30
#define INF 9999999
// Function-like macros: each argument appears twice in the expansion, so do
// not pass expressions with side effects (e.g. MAX(i++, j)).
#define MAX(a,b) (((a)>(b))?(a):(b))
#define MIN(a,b) (((a)<(b))?(a):(b))
/* Used to communicate with the horowitz model */
#define RISE 1
#define FALL 0
#define NCH 1
#define PCH 0
#define EPSILON 0.5 //v4.1: This constant is being used in order to fix floating point -> integer
//conversion problems that were occurring within CACTI. The typical problem was
//that with different compilers a floating point number like 3.0 would get represented as either
//2.9999....or 3.00000001 and then the integer part of the floating point number (3.0) would
//be computed differently depending on the compiler. What we are doing now is to replace
//int (x) with (int) (x+EPSILON) where EPSILON is 0.5. This would fix such problems. Note that
//this works only when x is an integer >= 0.
/*
* Later note: this is arguably a fix for the simple truncation problem
* (http://www.cs.tut.fi/~jkorpela/round.html) rather than for the problem mentioned above.
* Unfortunately, this solution causes nasty bugs (different results when using O0 and O3).
* Moreover, rounding is not correct in CACTI: when an extra fraction of a bit/line is needed,
* a complete bit/line has to be provided even if the fraction is just 0.01.
* So, in versions later than 6.5, (int)ceil() is used for double to int conversion.
*/
#define EPSILON2 0.1
#define EPSILON3 0.6
#define MINSUBARRAYROWS 16 //For simplicity in modeling, for the row decoding structure, we assume
//that each row predecode block is composed of at least one 2-4 decoder. When the outputs from the
//row predecode blocks are combined this means that there are at least 4*4=16 row decode outputs
#define MAXSUBARRAYROWS 262144 //Each row predecode block produces a max of 2^9 outputs. So
//the maximum number of row decode outputs will be 2^9*2^9 (= 262144)
#define MINSUBARRAYCOLS 2
#define MAXSUBARRAYCOLS 262144
// gate_type codes accepted by Component::compute_gate_area()
#define INV 0
#define NOR 1
#define NAND 2
// NOTE(review): 4 flavors here, but enum ram_cell_tech_type_num at the end of
// this header lists 5 cell types (0..4) -- confirm how comm_dram (4) is indexed.
#define NUMBER_TECH_FLAVORS 4
#define NUMBER_INTERCONNECT_PROJECTION_TYPES 2 //aggressive and conservative
//0 = Aggressive projections, 1 = Conservative projections
// NOTE(review): the value is 4 but the comments below name only three wire
// types -- confirm what the fourth entry is.
#define NUMBER_WIRE_TYPES 4 //local, semi-global and global
//1 = 'Semi-global' wire type, 2 = 'Global' wire type
#define NUMBER_TSV_TYPES 3
//0 = ITRS projected fine TSV type, 1 = Industrial reported large TSV type, 2 = TBD
const int dram_cell_tech_flavor = 3;
#define VBITSENSEMIN 0.08 //minimum bitline sense voltage is fixed to be 80 mV.
// Base of the logarithm used by Component::logical_effort() when it derives
// the number of stages from the path effort.
#define fopt 4.0
#define INPUT_WIRE_TO_INPUT_GATE_CAP_RATIO 0
#define BUFFER_SEPARATION_LENGTH_MULTIPLIER 1
#define NUMBER_MATS_PER_REDUNDANT_MAT 8
#define NUMBER_STACKED_DIE_LAYERS 1
// This value can be set to carry out solution optimization for
// a maximum area allocation (in mm^2, per the name).
#define STACKED_DIE_LAYER_ALLOTED_AREA_mm2 0 //6.24 //6.21//71.5
// This value can also be employed when solution optimization
// with maximum area allocation is carried out.
#define MAX_PERCENT_AWAY_FROM_ALLOTED_AREA 50
// This value can also be employed when solution optimization
// with maximum area allocation is carried out.
#define MIN_AREA_EFFICIENCY 20
// This value can be employed when a solution with a desired
// aspect ratio is required.
#define STACKED_DIE_LAYER_ASPECT_RATIO 1
// This value can be employed when a solution with a desired
// aspect ratio is required.
#define MAX_PERCENT_AWAY_FROM_ASPECT_RATIO 101
// This value can be employed to carry out solution optimization
// for a certain target random cycle time (in ns, per the name).
#define TARGET_CYCLE_TIME_ns 1000000000
#define NUMBER_PIPELINE_STAGES 4
// This can be used to model the length of the interconnect
// between a bank and a crossbar.
#define LENGTH_INTERCONNECT_FROM_BANK_TO_CROSSBAR 0 //3791 // 2880//micron
#define IS_CROSSBAR 0
#define NUMBER_INPUT_PORTS_CROSSBAR 8
#define NUMBER_OUTPUT_PORTS_CROSSBAR 8
#define NUMBER_SIGNALS_PER_PORT_CROSSBAR 256
#define MAT_LEAKAGE_REDUCTION_DUE_TO_SLEEP_TRANSISTORS_FACTOR 1
#define LEAKAGE_REDUCTION_DUE_TO_LONG_CHANNEL_HP_TRANSISTORS_FACTOR 1
#define PAGE_MODE 0
#define MAIN_MEM_PER_CHIP_STANDBY_CURRENT_mA 60
// MAIN_MEM_PER_CHIP_STANDBY_CURRENT_mA is not actually used in the CACTI code. It is kept to
// acknowledge that this current should be multiplied by the DDR(n) system VDD value to compute
// the standby power consumed during precharge.
const double VDD_STORAGE_LOSS_FRACTION_WORST = 0.125;
const double CU_RESISTIVITY = 0.022; //ohm-micron
const double BULK_CU_RESISTIVITY = 0.018; //ohm-micron
const double PERMITTIVITY_FREE_SPACE = 8.854e-18; //F/micron (8.854e-12 F/m)
// Number of cells per wordline-stitching interval for each cell type
// (NOTE(review): meaning inferred from the names -- confirm at the use sites).
const static uint32_t sram_num_cells_wl_stitching_ = 16;
const static uint32_t dram_num_cells_wl_stitching_ = 64;
const static uint32_t comm_dram_num_cells_wl_stitching_ = 256;
const static double num_bits_per_ecc_b_ = 8.0;
const double bit_to_byte = 8.0;
// Upper bound on the stage count returned by Component::logical_effort(),
// which asserts against it.
#define MAX_NUMBER_GATES_STAGE 20
#define MAX_NUMBER_HTREE_NODES 20
// Leakage scaling factors per gate type.
#define NAND2_LEAK_STACK_FACTOR 0.2
#define NAND3_LEAK_STACK_FACTOR 0.2
#define NOR2_LEAK_STACK_FACTOR 0.2
#define INV_LEAK_STACK_FACTOR 0.5
#define MAX_NUMBER_ARRAY_PARTITIONS 1000000
// abbreviations used in this project
// ----------------------------------
//
// num : number
// rw : read/write
// rd : read
// wr : write
// se : single-ended
// sz : size
// F : feature
// w : width
// h : height or horizontal
// v : vertical or velocity
// Numeric codes for the RAM cell / peripheral technology flavors. The
// enumerator names correspond to the configuration-file strings
// "itrs-hp", "itrs-lstp", "itrs-lop", "lp-dram" and "comm-dram".
enum ram_cell_tech_type_num
{
  itrs_hp   = 0,  // "itrs-hp"
  itrs_lstp = 1,  // "itrs-lstp"
  itrs_lop  = 2,  // "itrs-lop"
  lp_dram   = 3,  // "lp-dram"
  comm_dram = 4   // "comm-dram"
};
// Four-entry selector/weight vectors. Judging by the one-hot vectors below,
// the slot order is: [0] dynamic, [1] Isub, [2] Ig, [3] sc
// (presumably sub-threshold leakage, gate leakage and short-circuit -- confirm).
const double pppm[4]      = {1.0, 1.0, 1.0, 1.0};  // every component
const double pppm_lkg[4]  = {0.0, 1.0, 1.0, 0.0};  // Isub + Ig
const double pppm_dyn[4]  = {1.0, 0.0, 0.0, 0.0};  // dynamic only
const double pppm_Isub[4] = {0.0, 1.0, 0.0, 0.0};  // Isub only
const double pppm_Ig[4]   = {0.0, 0.0, 1.0, 0.0};  // Ig only
const double pppm_sc[4]   = {0.0, 0.0, 0.0, 1.0};  // sc only
const double Ilinear_to_Isat_ratio = 2.0;
#endif

View file

@ -0,0 +1,126 @@
l34c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
l34c64l2b: 9 11 19 29 43 62 81 102
l34c64l4b: 6 8 12 17 24 29 39 47
l34c64l8b: 7 8 10 14 18 22 25 30
l34c64l16b: 7 7 9 12 14 17 20 24
l34c64l32b: 7 7 9 12 14 17 20 24 -r
l34c64l64b: 7 7 9 12 14 17 20 24 -r
l34c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
l34c128l2b: 4 10 19 30 44 64 82 103
l34c128l4b: 3 6 11 17 24 31 38 47
l34c128l8b: 3 5 9 13 17 21 25 29
l34c128l16b: 4 5 7 10 13 16 19 22
l34c128l32b: 4 5 7 10 13 16 19 22 -r
l34c128l64b: 4 5 7 10 13 16 19 22 -r
l34c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
l34c256l2b: 3 10 19 30 44 63 82 103
l34c256l4b: 3 6 11 17 24 31 38 47
l34c256l8b: 2 5 8 12 16 20 24 29
l34c256l16b: 2 4 7 9 12 15 18 21
l34c256l32b: 2 4 7 9 12 15 18 21 -r
l34c256l64b: 2 4 7 9 12 15 18 21 -r
l38c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
l38c64l2b: 57 59 77 90 137 187 219 245
l38c64l4b: 35 40 48 56 43 61 80 101
l38c64l8b: 18 27 41 45 52 58 58 58 -r
l38c64l16b: 16 17 19 35 40 49 53 53 -r
l38c64l32b: 15 15 17 19 22 25 30 30 -r
l38c64l64b: 15 15 17 19 22 25 30 30 -r
l38c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
l38c128l2b: 38 50 78 93 139 188 220 245
l38c128l4b: 29 37 46 56 43 61 81 102
l38c128l8b: 16 30 39 44 50 57 57 57 -r
l38c128l16b: 14 16 19 33 40 47 52 52 -r
l38c128l32b: 14 15 17 20 23 27 31 31 -r
l38c128l64b: 14 15 17 20 23 27 31 31 -r
l38c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
l38c256l2b: 35 50 78 94 139 188 220 246
l38c256l4b: 28 36 45 55 55 61 81 102
l38c256l8b: 17 30 38 43 50 57 57 57 -r
l38c256l16b: 15 17 21 32 40 47 51 51
l38c256l32b: 15 17 19 21 24 29 33 33
l38c256l64b: 15 17 19 21 24 29 33 33 -r
l316c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
l316c64l2b: 1000 1000 1000 1000 1000 1000 1000 1000
l316c64l4b: 34 35 78 126 178 220 252 274
l316c64l8b: 9 11 23 43 62 87 105 130
l316c64l16b: 7 9 13 23 33 45 56 67
l316c64l32b: 5 6 7 10 13 19 25 30
l316c64l64b: 4 5 6 8 10 14 18 21
l316c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
l316c128l2b: 25 131 243 1000 1000 1000 1000 1000
l316c128l4b: 8 28 79 127 179 221 253 274
l316c128l8b: 4 9 22 43 62 88 106 131
l316c128l16b: 4 6 11 21 32 44 55 67
l316c128l32b: 4 6 11 12 12 18 24 29
l316c128l64b: 2 3 5 7 9 13 17 21
l316c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
l316c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000
l316c256l4b: 5 28 80 128 180 221 253 274
l316c256l8b: 3 8 22 43 63 88 107 131
l316c256l16b: 2 5 11 21 32 44 55 67
l316c256l32b: 2 3 5 8 12 18 24 29
l316c256l64b: 2 3 4 6 9 13 17 21
l24c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
l24c64l2b: 10 12 24 41 60 86 105 122
l24c64l4b: 5 7 13 20 29 38 47 56
l24c64l8b: 5 6 9 14 18 24 29 35
l24c64l16b: 4 5 7 10 12 16 19 22
l24c64l32b: 5 5 6 8 10 12 14 17
l24c64l64b: 5 5 6 8 10 12 14 16
l24c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
l24c128l2b: 1000 1000 1000 1000 1000 1000 1000 1000
l24c128l4b: 3 7 13 20 29 38 47 57
l24c128l8b: 3 5 9 13 18 23 29 35
l24c128l16b: 3 4 6 9 12 15 19 22
l24c128l32b: 3 4 5 7 9 11 14 16
l24c128l64b: 1000 1000 1000 1000 1000 1000 1000 1000
l24c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
l24c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000
l24c256l4b: 2 6 13 20 29 38 47 57
l24c256l8b: 2 4 8 13 18 23 28 35
l24c256l16b: 2 3 6 8 11 15 18 22
l24c256l32b: 2 3 5 6 8 11 14 16
l24c256l64b: 1000 1000 1000 1000 1000 1000 1000 1000
l28c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
l28c64l2b: 46 52 117 157 188 225 246 261
l28c64l4b: 19 25 39 54 96 107 120 150
l28c64l8b: 9 12 21 30 39 47 58 79
l28c64l16b: 8 9 11 16 25 32 37 42
l28c64l32b: 7 8 9 11 14 19 23 28
l28c64l64b: 7 7 8 10 12 14 18 22
l28c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
l28c128l2b: 1000 1000 1000 1000 1000 1000 1000 1000
l28c128l4b: 12 22 39 54 98 108 130 151
l28c128l8b: 7 12 21 30 39 48 59 80
l28c128l16b: 6 8 11 16 24 31 37 42
l28c128l32b: 6 7 9 11 14 19 24 28
l28c128l64b: 6 7 9 11 14 19 24 28
l28c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
l28c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000
l28c256l4b: 12 22 39 54 100 108 130 152
l28c256l8b: 7 12 21 30 39 48 59 81
l28c256l16b: 6 8 11 16 24 31 37 42
l28c256l32b: 6 7 9 11 14 19 24 28
l28c256l64b: 6 7 9 11 14 19 24 28
l216c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
l216c64l2b: 1000 1000 1000 1000 1000 1000 1000 1000
l216c64l4b: 34 35 78 126 178 220 252 274
l216c64l8b: 9 11 23 43 62 87 105 130
l216c64l16b: 7 9 13 23 33 45 56 67
l216c64l32b: 5 6 7 10 13 19 25 30
l216c64l64b: 4 5 6 8 10 14 18 21
l216c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
l216c128l2b: 25 131 243 1000 1000 1000 1000 1000
l216c128l4b: 8 28 79 127 179 221 253 274
l216c128l8b: 4 9 22 43 62 88 106 131
l216c128l16b: 4 6 11 21 32 44 55 67
l216c128l32b: 4 6 11 12 12 18 24 29
l216c128l64b: 2 3 5 7 9 13 17 21
l216c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
l216c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000
l216c256l4b: 5 28 80 128 180 221 253 274
l216c256l8b: 3 8 22 43 63 88 107 131
l216c256l16b: 2 5 11 21 32 44 55 67
l216c256l32b: 2 3 5 8 12 18 24 29
l216c256l64b: 2 3 4 6 9 13 17 21

View file

@ -0,0 +1,161 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#include "crossbar.h"
#define ASPECT_THRESHOLD .8
#define ADJ 1
/*
 * Construct a crossbar model.
 *
 * n_inp_     : number of input ports
 * n_out_     : number of output ports
 * flit_size_ : flit width (print_crossbar() reports it in bits)
 * dt         : device parameters used for Vdd, Vth and the n-to-p drive ratio;
 *              must not be null (it is dereferenced here).
 *
 * Every data member is given a defined value. The tri-state capacitances and
 * stage sizes start at 0 and are filled in by output_buffer(); previously they
 * were left indeterminate, so reading the public tri_*_cap members before
 * compute_power() was undefined behaviour.
 * The initialiser list follows the member declaration order in crossbar.h.
 */
Crossbar::Crossbar(
double n_inp_,
double n_out_,
double flit_size_,
/*TechnologyParameter::*/DeviceType *dt
):n_inp(n_inp_), n_out(n_out_), flit_size(flit_size_),
tri_inp_cap(0), tri_out_cap(0), tri_ctr_cap(0), tri_int_cap(0),
CB_ADJ(1), deviceType(dt), TriS1(0), TriS2(0), min_w_pmos(0), Vdd(0)
{
// Fail loudly in debug builds instead of dereferencing a null pointer below.
assert(dt != 0);
min_w_pmos = dt->n_to_p_eff_curr_drv_ratio*g_tp.min_w_nmos_;
Vdd = dt->Vdd;
}
// Nothing to release: deviceType is a non-owning pointer.
Crossbar::~Crossbar(){}
double Crossbar::output_buffer()
{
//Wire winit(4, 4);
double l_eff = n_inp*flit_size*g_tp.wire_outside_mat.pitch;
Wire w1(g_ip->wt, l_eff);
//double s1 = w1.repeater_size *l_eff*ADJ/w1.repeater_spacing;
double s1 = w1.repeater_size * (l_eff <w1.repeater_spacing? l_eff *ADJ/w1.repeater_spacing : ADJ);
double pton_size = deviceType->n_to_p_eff_curr_drv_ratio;
// the model assumes input capacitance of the wire driver = input capacitance of nand + nor = input cap of the driver transistor
TriS1 = s1*(1 + pton_size)/(2 + pton_size + 1 + 2*pton_size);
TriS2 = s1; //driver transistor
if (TriS1 < 1)
TriS1 = 1;
double input_cap = gate_C(TriS1*(2*min_w_pmos + g_tp.min_w_nmos_), 0) +
gate_C(TriS1*(min_w_pmos + 2*g_tp.min_w_nmos_), 0);
// input_cap += drain_C_(TriS1*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
// drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)*2 +
// gate_C(TriS2*g_tp.min_w_nmos_, 0)+
// drain_C_(TriS1*min_w_pmos, NCH, 1, 1, g_tp.cell_h_def)*2 +
// drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
// gate_C(TriS2*min_w_pmos, 0);
tri_int_cap = drain_C_(TriS1*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)*2 +
gate_C(TriS2*g_tp.min_w_nmos_, 0)+
drain_C_(TriS1*min_w_pmos, NCH, 1, 1, g_tp.cell_h_def)*2 +
drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
gate_C(TriS2*min_w_pmos, 0);
double output_cap = drain_C_(TriS2*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
drain_C_(TriS2*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def);
double ctr_cap = gate_C(TriS2 *(min_w_pmos + g_tp.min_w_nmos_), 0);
tri_inp_cap = input_cap;
tri_out_cap = output_cap;
tri_ctr_cap = ctr_cap;
return input_cap + output_cap + ctr_cap;
}
/*
 * Compute the crossbar's area (area.w, area.h), power (power.readOp.dynamic,
 * .leakage, .gate_leakage) and delay.
 *
 * The function calls itself with CB_ADJ increased by 0.2 while the aspect
 * ratio is below ASPECT_THRESHOLD, both port counts exceed 2 and CB_ADJ < 4.
 *
 * NOTE(review): after the recursive call returns, execution continues in the
 * caller, so the power figures are overwritten using the caller's w1/w2
 * (built for the smaller CB_ADJ) while area.w/area.h keep the values written
 * by the innermost call. Confirm whether an early return after the recursive
 * call is intended.
 */
void Crossbar::compute_power()
{
// NOTE(review): this object is never used; presumably it is constructed for a
// side effect of this Wire constructor - confirm in wire.cc.
Wire winit(4, 4);
double tri_cap = output_buffer();
assert(tri_cap > 0);
//area of a tristate logic
double g_area = compute_gate_area(INV, 1, TriS2*g_tp.min_w_nmos_, TriS2*min_w_pmos, g_tp.cell_h_def);
g_area *= 2; // to model area of output transistors
g_area += compute_gate_area (NAND, 2, TriS1*2*g_tp.min_w_nmos_, TriS1*min_w_pmos, g_tp.cell_h_def);
g_area += compute_gate_area (NOR, 2, TriS1*g_tp.min_w_nmos_, TriS1*2*min_w_pmos, g_tp.cell_h_def);
double width /*per tristate*/ = g_area/(CB_ADJ * g_tp.cell_h_def);
// effective no. of tristate buffers that need to be laid side by side
int ntri = (int)ceil(g_tp.cell_h_def/(g_tp.wire_outside_mat.pitch));
// Crossbar width: whichever is larger, the buffers laid side by side or the wire pitch budget.
double wire_len = MAX(width*ntri*n_out, flit_size*g_tp.wire_outside_mat.pitch*n_out);
Wire w1(g_ip->wt, wire_len);
area.w = wire_len;
area.h = g_tp.wire_outside_mat.pitch*n_inp*flit_size * CB_ADJ;
Wire w2(g_ip->wt, area.h);
// Aspect ratio normalised by port counts and folded into (0, 1].
double aspect_ratio_cb = (area.h/area.w)*(n_out/n_inp);
if (aspect_ratio_cb > 1) aspect_ratio_cb = 1/aspect_ratio_cb;
if (aspect_ratio_cb < ASPECT_THRESHOLD) {
if (n_out > 2 && n_inp > 2) {
CB_ADJ+=0.2;
//cout << "CB ADJ " << CB_ADJ << endl;
if (CB_ADJ < 4) {
this->compute_power();
}
}
}
power.readOp.dynamic = (w1.power.readOp.dynamic + w2.power.readOp.dynamic + (tri_inp_cap * n_out + tri_out_cap * n_inp + tri_ctr_cap + tri_int_cap) * Vdd*Vdd)*flit_size;
power.readOp.leakage = n_inp * n_out * flit_size * (
cmos_Isub_leakage(g_tp.min_w_nmos_*TriS2*2, min_w_pmos*TriS2*2, 1, inv) *Vdd+
cmos_Isub_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nand)*Vdd+
cmos_Isub_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nor) *Vdd+
w1.power.readOp.leakage + w2.power.readOp.leakage);
power.readOp.gate_leakage = n_inp * n_out * flit_size * (
cmos_Ig_leakage(g_tp.min_w_nmos_*TriS2*2, min_w_pmos*TriS2*2, 1, inv) *Vdd+
cmos_Ig_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nand)*Vdd+
cmos_Ig_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nor) *Vdd+
w1.power.readOp.gate_leakage + w2.power.readOp.gate_leakage);
// delay calculation
double l_eff = n_inp*flit_size*g_tp.wire_outside_mat.pitch;
Wire wdriver(g_ip->wt, l_eff);
// Lumped RC: wire resistance/capacitance over (width + height) plus driver on-resistance and buffer loads.
double res = g_tp.wire_outside_mat.R_per_um * (area.w+area.h) + tr_R_on(g_tp.min_w_nmos_*wdriver.repeater_size, NCH, 1);
double cap = g_tp.wire_outside_mat.C_per_um * (area.w + area.h) + n_out*tri_inp_cap + n_inp*tri_out_cap;
delay = horowitz(w1.signal_rise_time(), res*cap, deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE);
// NOTE(review): this line DECLARES a function named wreset returning Wire
// (C++ "most vexing parse"); it does not construct an object, so nothing is
// reset here. If a reset of Wire state was intended, it never happens -
// confirm against wire.h before changing.
Wire wreset();
}
/*
 * Print the crossbar's port counts, flit size, dimensions, power figures and
 * delay to cout. The dynamic figure is scaled by 1e9 and by MIN(n_inp, n_out);
 * leakage figures are scaled by 1e3 and the delay by 1e12 before printing.
 */
void Crossbar::print_crossbar()
{
const double dyn_scaled = power.readOp.dynamic*1e9 * MIN(n_inp, n_out);
const double sub_leak_scaled = power.readOp.leakage*1e3;
const double gate_leak_scaled = power.readOp.gate_leakage*1e3;
const double delay_scaled = delay*1e12;

cout << "\nCrossbar Stats (" << n_inp << "x" << n_out << ")\n\n"
<< "Flit size : " << flit_size << " bits" << endl
<< "Width : " << area.w << " u" << endl
<< "Height : " << area.h << " u" << endl
<< "Dynamic Power : " << dyn_scaled << " (nJ)" << endl
<< "Leakage Power : " << sub_leak_scaled << " (mW)" << endl
<< "Gate Leakage Power : " << gate_leak_scaled << " (mW)" << endl
<< "Crossbar Delay : " << delay_scaled << " ps\n";
}

View file

@ -0,0 +1,83 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#ifndef __CROSSBAR__
#define __CROSSBAR__
#include <assert.h>
#include <iostream>
#include "basic_circuit.h"
#include "cacti_interface.h"
#include "component.h"
#include "parameter.h"
#include "mat.h"
#include "wire.h"
/*
 * Model of an n_inp x n_out crossbar built from tri-state buffers.
 * Typical use: construct, call compute_power(), then read area/power/delay
 * (inherited from Component) or call print_crossbar().
 */
class Crossbar : public Component
{
public:
// in/out: number of input/output ports; flit_sz: flit width;
// dt: device parameters (defaults to the global peripheral device type).
Crossbar(
double in,
double out,
double flit_sz,
/*TechnologyParameter::*/DeviceType *dt = &(g_tp.peri_global));
~Crossbar();
// Prints the crossbar statistics to cout.
void print_crossbar();
// Sizes the tri-state buffer, fills the tri_*_cap members and returns
// tri_inp_cap + tri_out_cap + tri_ctr_cap.
double output_buffer();
// Computes area, power and delay (calls output_buffer() internally).
void compute_power();
double n_inp, n_out;
double flit_size;
// Tri-state buffer capacitances: input, output, control and internal node.
// Written by output_buffer().
double tri_inp_cap, tri_out_cap, tri_ctr_cap, tri_int_cap;
private:
double CB_ADJ;
/*
 * Adjustment factor for the height of the cross-point (tri-state buffer) cell layout in the crossbar.
 * The buffer is adjusted so that the aspect ratio of the whole crossbar is close to one.
 * While the ratio is being adjusted, the number of wires routed over the tri-state buffers does not change;
 * however, the effective wiring pitch changes. Specifically, since CB_ADJ increases
 * during the adjustment, the tri-state buffer becomes taller and thinner, and the effective wiring pitch
 * increases. As a result, the height of the crossbar (area.h) increases.
 */
// Non-owning pointer to the device parameters passed to the constructor.
/*TechnologyParameter::*/DeviceType *deviceType;
// Sizes of the pre-driver stage (TriS1) and the driver stage (TriS2); set by output_buffer().
double TriS1, TriS2;
double min_w_pmos, Vdd;
};
#endif

254
T1/TP/TP1/cacti_7/ddr3.cfg Normal file
View file

@ -0,0 +1,254 @@
# Cache size
//-size (bytes) 2048
//-size (bytes) 4096
//-size (bytes) 32768
//-size (bytes) 131072
//-size (bytes) 262144
//-size (bytes) 1048576
//-size (bytes) 2097152
//-size (bytes) 4194304
-size (bytes) 8388608
//-size (bytes) 16777216
//-size (bytes) 33554432
//-size (bytes) 134217728
//-size (bytes) 67108864
//-size (bytes) 1073741824
# power gating
-Array Power Gating - "false"
-WL Power Gating - "false"
-CL Power Gating - "false"
-Bitline floating - "false"
-Interconnect Power Gating - "false"
-Power Gating Performance Loss 0.01
# Line size
//-block size (bytes) 8
-block size (bytes) 64
# To model Fully Associative cache, set associativity to zero
//-associativity 0
//-associativity 2
//-associativity 4
//-associativity 8
-associativity 8
-read-write port 1
-exclusive read port 0
-exclusive write port 0
-single ended read ports 0
# Multiple banks connected using a bus
-UCA bank count 1
-technology (u) 0.022
//-technology (u) 0.040
//-technology (u) 0.032
//-technology (u) 0.090
# following three parameters are meaningful only for main memories
-page size (bits) 8192
-burst length 8
-internal prefetch width 8
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
-Data array cell type - "itrs-hp"
//-Data array cell type - "itrs-lstp"
//-Data array cell type - "itrs-lop"
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
-Data array peripheral type - "itrs-hp"
//-Data array peripheral type - "itrs-lstp"
//-Data array peripheral type - "itrs-lop"
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
-Tag array cell type - "itrs-hp"
//-Tag array cell type - "itrs-lstp"
//-Tag array cell type - "itrs-lop"
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
-Tag array peripheral type - "itrs-hp"
//-Tag array peripheral type - "itrs-lstp"
//-Tag array peripheral type - "itrs-lop"
# Bus width include data bits and address bits required by the decoder
//-output/input bus width 16
-output/input bus width 512
// 300-400 in steps of 10
-operating temperature (K) 360
# Type of memory - cache (with a tag array) or ram (scratch ram similar to a register file)
# or main memory (no tag array and every access will happen at a page granularity Ref: CACTI 5.3 report)
-cache type "cache"
//-cache type "ram"
//-cache type "main memory"
# to model special structure like branch target buffers, directory, etc.
# change the tag size parameter
# if you want cacti to calculate the tagbits, set the tag size to "default"
-tag size (b) "default"
//-tag size (b) 22
# fast - data and tag access happen in parallel
# sequential - data array is accessed after accessing the tag array
# normal - data array lookup and tag access happen in parallel
# final data block is broadcasted in data array h-tree
# after getting the signal from the tag array
//-access mode (normal, sequential, fast) - "fast"
-access mode (normal, sequential, fast) - "normal"
//-access mode (normal, sequential, fast) - "sequential"
# DESIGN OBJECTIVE for UCA (or banks in NUCA)
-design objective (weight delay, dynamic power, leakage power, cycle time, area) 0:0:0:100:0
# Percentage deviation from the minimum value
# Ex: A deviation value of 10:1000:1000:1000:1000 will try to find an organization
# that compromises at most 10% delay.
# NOTE: Try reasonable values for % deviation. Inconsistent deviation
# percentage values will not produce any valid organizations. For example,
# 0:0:100:100:100 will try to identify an organization that has both
# least delay and dynamic power. Since such an organization is not possible, CACTI will
# throw an error. Refer CACTI-6 Technical report for more details
-deviate (delay, dynamic power, leakage power, cycle time, area) 20:100000:100000:100000:100000
# Objective for NUCA
-NUCAdesign objective (weight delay, dynamic power, leakage power, cycle time, area) 100:100:0:0:100
-NUCAdeviate (delay, dynamic power, leakage power, cycle time, area) 10:10000:10000:10000:10000
# Set optimize tag to ED or ED^2 to obtain a cache configuration optimized for
# energy-delay or energy-delay sq. product
# Note: Optimize tag will disable weight or deviate values mentioned above
# Set it to NONE to let weight and deviate values determine the
# appropriate cache configuration
//-Optimize ED or ED^2 (ED, ED^2, NONE): "ED"
-Optimize ED or ED^2 (ED, ED^2, NONE): "ED^2"
//-Optimize ED or ED^2 (ED, ED^2, NONE): "NONE"
-Cache model (NUCA, UCA) - "UCA"
//-Cache model (NUCA, UCA) - "NUCA"
# In order for CACTI to find the optimal NUCA bank value the following
# variable should be assigned 0.
-NUCA bank count 0
# NOTE: for nuca network frequency is set to a default value of
# 5GHz in time.c. CACTI automatically
# calculates the maximum possible frequency and downgrades this value if necessary
# By default CACTI considers both full-swing and low-swing
# wires to find an optimal configuration. However, it is possible to
# restrict the search space by changing the signaling from "default" to
# "fullswing" or "lowswing" type.
-Wire signaling (fullswing, lowswing, default) - "Global_30"
//-Wire signaling (fullswing, lowswing, default) - "default"
//-Wire signaling (fullswing, lowswing, default) - "lowswing"
//-Wire inside mat - "global"
-Wire inside mat - "semi-global"
//-Wire outside mat - "global"
-Wire outside mat - "semi-global"
-Interconnect projection - "conservative"
//-Interconnect projection - "aggressive"
# Contention in network (which is a function of core count and cache level) is one of
# the critical factor used for deciding the optimal bank count value
# core count can be 4, 8, or 16
//-Core count 4
-Core count 8
//-Core count 16
-Cache level (L2/L3) - "L3"
-Add ECC - "true"
//-Print level (DETAILED, CONCISE) - "CONCISE"
-Print level (DETAILED, CONCISE) - "DETAILED"
# for debugging
//-Print input parameters - "true"
-Print input parameters - "false"
# force CACTI to model the cache with the
# following Ndbl, Ndwl, Nspd, Ndsam,
# and Ndcm values
//-Force cache config - "true"
-Force cache config - "false"
-Ndwl 1
-Ndbl 1
-Nspd 0
-Ndcm 1
-Ndsam1 0
-Ndsam2 0
#### Default CONFIGURATION values for baseline external IO parameters to DRAM. More details can be found in the CACTI-IO technical report, especially Chapters 2 and 3.
# Memory Type (D=DDR3, L=LPDDR2, W=WideIO). Additional memory types can be defined by the user in extio_technology.cc, along with their technology and configuration parameters.
-dram_type "D"
//-dram_type "L"
//-dram_type "W"
//-dram_type "S"
# Memory State (R=Read, W=Write, I=Idle or S=Sleep)
//-iostate "R"
-iostate "W"
//-iostate "I"
//-iostate "S"
#Address bus timing. To alleviate the timing on the command and address bus due to high loading (shared across all memories on the channel), the interface allows for multi-cycle timing options.
-addr_timing 0.5 //DDR
//-addr_timing 1.0 //SDR (half of DQ rate)
//-addr_timing 2.0 //2T timing (One fourth of DQ rate)
//-addr_timing 3.0 // 3T timing (One sixth of DQ rate)
# Memory Density (Gbit per memory/DRAM die)
-mem_density 8 Gb //Valid values 2^n Gb
# IO frequency (MHz) (frequency of the external memory interface).
-bus_freq 800 MHz //As of current memory standards (2013), valid range 0 to 1.5 GHz for DDR3, 0 to 533 MHz for LPDDR2, 0 - 800 MHz for WideIO and 0 - 3 GHz for Low-swing differential. However this can change, and the user is free to define valid ranges based on new memory types or extending beyond existing standards for existing dram types.
# Duty Cycle (fraction of time in the Memory State defined above)
-duty_cycle 1.0 //Valid range 0 to 1.0
# Activity factor for Data (0->1 transitions) per cycle (for DDR, need to account for the higher activity in this parameter. E.g. max. activity factor for DDR is 1.0, for SDR is 0.5)
-activity_dq 1.0 //Valid range 0 to 1.0 for DDR, 0 to 0.5 for SDR
#-activity_dq .50 //Valid range 0 to 1.0 for DDR, 0 to 0.5 for SDR
# Activity factor for Control/Address (0->1 transitions) per cycle (for DDR, need to account for the higher activity in this parameter. E.g. max. activity factor for DDR is 1.0, for SDR is 0.5)
-activity_ca 1.0 //Valid range 0 to 1.0 for DDR, 0 to 0.5 for SDR, 0 to 0.25 for 2T, and 0 to 0.17 for 3T
#-activity_ca 0.25 //Valid range 0 to 1.0 for DDR, 0 to 0.5 for SDR, 0 to 0.25 for 2T, and 0 to 0.17 for 3T
# Number of DQ pins
-num_dq 72 //Number of DQ pins. Includes ECC pins.
# Number of DQS pins. DQS is a data strobe that is sent along with a small number of data-lanes so the source synchronous timing is local to these DQ bits. Typically, 1 DQS per byte (8 DQ bits) is used. The DQS is also typically differential, just like the CLK pin.
-num_dqs 36 //2 x differential pairs. Include ECC pins as well. Valid range 0 to 18. For x4 memories, could have 36 DQS pins.
# Number of CA pins
-num_ca 35 //Valid range 0 to 35 pins.
#-num_ca 25 //Valid range 0 to 35 pins.
# Number of CLK pins. CLK is typically a differential pair. In some cases additional CLK pairs may be used to limit the loading on the CLK pin.
-num_clk 2 //2 x differential pair. Valid values: 0/2/4.
# Number of Physical Ranks
-num_mem_dq 2 //Number of ranks (loads on DQ and DQS) per buffer/register. If multiple LRDIMMs or buffer chips exist, the analysis for capacity and power is reported per buffer/register.
# Width of the Memory Data Bus
-mem_data_width 4 //x4 or x8 or x16 or x32 memories. For WideIO upto x128.

1673
T1/TP/TP1/cacti_7/decoder.cc Normal file

File diff suppressed because it is too large Load diff

272
T1/TP/TP1/cacti_7/decoder.h Normal file
View file

@ -0,0 +1,272 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#ifndef __DECODER_H__
#define __DECODER_H__
#include "area.h"
#include "component.h"
#include "parameter.h"
#include "powergating.h"
#include <vector>
using namespace std;
/*
 * Decoder component: holds the sizing, load and timing state of a decoder
 * gate chain of up to MAX_NUMBER_GATES_STAGE stages.
 */
class Decoder : public Component
{
public:
Decoder(
int _num_dec_signals,
bool flag_way_select,
double _C_ld_dec_out,
double _R_wire_dec_out,
bool fully_assoc_,
bool is_dram_,
bool is_wl_tr_,
const Area & cell_);
bool exist;
int num_in_signals;
// Load capacitance and wire resistance at the decoder output.
double C_ld_dec_out;
double R_wire_dec_out;
int num_gates;
int num_gates_min;
// Per-stage NMOS / PMOS widths of the gate chain.
double w_dec_n[MAX_NUMBER_GATES_STAGE];
double w_dec_p[MAX_NUMBER_GATES_STAGE];
// NOTE(review): Crossbar assigns a `delay` it does not declare, so Component
// presumably already provides one; this member would hide it - confirm.
double delay;
//powerDef power;
bool fully_assoc;
bool is_dram;
bool is_wl_tr;
double total_driver_nwidth;
double total_driver_pwidth;
// Sleep transistor model; see the destructor note below about ownership.
Sleep_tx * sleeptx;
// Reference member: the referenced Area must outlive this object, and the
// class is not copy-assignable because of it.
const Area & cell;
int nodes_DSTN;
void compute_widths();
void compute_area();
double compute_delays(double inrisetime); // return outrisetime
void compute_power_gating();
void leakage_feedback(double temperature);
// NOTE(review): the test below is true only when sleeptx is null, so the
// delete never frees anything and any object sleeptx points to is not
// released here. Before inverting the test, confirm in decoder.cc that every
// constructor initialises sleeptx and that Decoder objects are never copied
// (the implicit copy constructor would share the pointer).
~Decoder()
{
if (!sleeptx)
delete sleeptx;
};
};
/*
 * Predecoder block: sizing and timing state for a first level (L1) of
 * NAND2/NAND3 gate paths and a second level (L2), feeding the Decoder `dec`.
 */
class PredecBlk : public Component
{
public:
PredecBlk(
int num_dec_signals,
Decoder * dec,
double C_wire_predec_blk_out,
double R_wire_predec_blk_out,
int num_dec_per_predec,
bool is_dram_,
bool is_blk1);
// Decoder driven by this block. NOTE(review): presumably non-owning - no destructor is declared here.
Decoder * dec;
bool exist;
int number_input_addr_bits;
// Load capacitance and wire resistance at the block output.
double C_ld_predec_blk_out;
double R_wire_predec_blk_out;
int branch_effort_nand2_gate_output;
int branch_effort_nand3_gate_output;
bool flag_two_unique_paths;
int flag_L2_gate;
int number_inputs_L1_gate;
int number_gates_L1_nand2_path;
int number_gates_L1_nand3_path;
int number_gates_L2;
int min_number_gates_L1;
int min_number_gates_L2;
int num_L1_active_nand2_path;
int num_L1_active_nand3_path;
// Per-stage NMOS / PMOS widths for the L1 NAND2 path, the L1 NAND3 path and L2.
double w_L1_nand2_n[MAX_NUMBER_GATES_STAGE];
double w_L1_nand2_p[MAX_NUMBER_GATES_STAGE];
double w_L1_nand3_n[MAX_NUMBER_GATES_STAGE];
double w_L1_nand3_p[MAX_NUMBER_GATES_STAGE];
double w_L2_n[MAX_NUMBER_GATES_STAGE];
double w_L2_p[MAX_NUMBER_GATES_STAGE];
double delay_nand2_path;
double delay_nand3_path;
powerDef power_nand2_path;
powerDef power_nand3_path;
powerDef power_L2;
bool is_dram_;
void compute_widths();
void compute_area();
void leakage_feedback(double temperature);
// Takes the input rise times of the two paths and returns their output rise times.
pair<double, double> compute_delays(pair<double, double> inrisetime); // <nand2, nand3>
// return <outrise_nand2, outrise_nand3>
};
/*
 * Driver stage in front of a predecoder block: buffer chains for the NAND2
 * and NAND3 paths of the PredecBlk `blk`.
 */
class PredecBlkDrv : public Component
{
public:
PredecBlkDrv(
int way_select,
PredecBlk * blk_,
bool is_dram);
int flag_driver_exists;
int number_input_addr_bits;
int number_gates_nand2_path;
int number_gates_nand3_path;
int min_number_gates;
// Buffer counts grouped by the number of NAND2 / NAND3 loads each buffer drives.
int num_buffers_driving_1_nand2_load;
int num_buffers_driving_2_nand2_load;
int num_buffers_driving_4_nand2_load;
int num_buffers_driving_2_nand3_load;
int num_buffers_driving_8_nand3_load;
int num_buffers_nand3_path;
// Output load capacitance and resistance of each path.
double c_load_nand2_path_out;
double c_load_nand3_path_out;
double r_load_nand2_path_out;
double r_load_nand3_path_out;
// Per-stage NMOS / PMOS widths of each path.
double width_nand2_path_n[MAX_NUMBER_GATES_STAGE];
double width_nand2_path_p[MAX_NUMBER_GATES_STAGE];
double width_nand3_path_n[MAX_NUMBER_GATES_STAGE];
double width_nand3_path_p[MAX_NUMBER_GATES_STAGE];
double delay_nand2_path;
double delay_nand3_path;
powerDef power_nand2_path;
powerDef power_nand3_path;
// NOTE(review): presumably non-owning pointers - no destructor is declared here.
PredecBlk * blk;
Decoder * dec;
bool is_dram_;
int way_select;
void compute_widths();
void compute_area();
void leakage_feedback(double temperature);
pair<double, double> compute_delays(
double inrisetime_nand2_path,
double inrisetime_nand3_path); // return <outrise_nand2, outrise_nand3>
// Sum of the buffer counts driving 1, 2 and 4 NAND2 loads.
inline int num_addr_bits_nand2_path()
{
return num_buffers_driving_1_nand2_load +
num_buffers_driving_2_nand2_load +
num_buffers_driving_4_nand2_load;
}
// Sum of the buffer counts driving 2 and 8 NAND3 loads.
inline int num_addr_bits_nand3_path()
{
return num_buffers_driving_2_nand3_load +
num_buffers_driving_8_nand3_load;
}
double get_rdOp_dynamic_E(int num_act_mats_hor_dir);
};
/*
 * Complete predecoder: pairs two predecoder blocks (blk1, blk2) with their
 * drivers (drv1, drv2) and keeps block and driver power separately.
 */
class Predec : public Component
{
public:
Predec(
PredecBlkDrv * drv1,
PredecBlkDrv * drv2);
double compute_delays(double inrisetime); // return outrisetime
void leakage_feedback(double temperature);
// NOTE(review): presumably non-owning pointers - no destructor is declared here.
PredecBlk * blk1;
PredecBlk * blk2;
PredecBlkDrv * drv1;
PredecBlkDrv * drv2;
powerDef block_power;
powerDef driver_power;
private:
// returns <delay, risetime>
pair<double, double> get_max_delay_before_decoder(
pair<double, double> input_pair1,
pair<double, double> input_pair2);
};
/*
 * Generic driver: a gate chain of up to MAX_NUMBER_GATES_STAGE stages
 * driving a gate load plus a wire load.
 */
class Driver : public Component
{
public:
Driver(double c_gate_load_, double c_wire_load_, double r_wire_load_, bool is_dram);
int number_gates;
int min_number_gates;
// Per-stage NMOS / PMOS widths of the chain.
double width_n[MAX_NUMBER_GATES_STAGE];
double width_p[MAX_NUMBER_GATES_STAGE];
// Loads passed to the constructor.
double c_gate_load;
double c_wire_load;
double r_wire_load;
// NOTE(review): Crossbar assigns a `delay` it does not declare, so Component
// presumably already provides one; this member would hide it - confirm.
double delay;
// powerDef power;
bool is_dram_;
double total_driver_nwidth;
double total_driver_pwidth;
// Sleep transistor model; see the destructor note below about ownership.
Sleep_tx * sleeptx;
void compute_widths();
void compute_area();
double compute_delay(double inrisetime);
void compute_power_gating();
// NOTE(review): the test below is true only when sleeptx is null, so the
// delete never frees anything and any object sleeptx points to is not
// released here. Before inverting the test, confirm in decoder.cc that every
// constructor initialises sleeptx and that Driver objects are never copied
// (the implicit copy constructor would share the pointer).
~Driver()
{
if (!sleeptx)
delete sleeptx;
};
};
#endif

114
T1/TP/TP1/cacti_7/dram.cfg Normal file
View file

@ -0,0 +1,114 @@
//-size (bytes) 16777216
//-size (bytes) 33554432
-size (bytes) 134217728
//-size (bytes) 67108864
//-size (bytes) 1073741824
-block size (bytes) 64
-associativity 1
-read-write port 1
-exclusive read port 0
-exclusive write port 0
-single ended read ports 0
-UCA bank count 1
//-technology (u) 0.032
//-technology (u) 0.045
-technology (u) 0.068
//-technology (u) 0.078
# following three parameters are meaningful only for main memories
-page size (bits) 8192
-burst length 8
-internal prefetch width 8
# following parameter can have one of the five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
-Data array cell type - "comm-dram"
# following parameter can have one of the three values -- (itrs-hp, itrs-lstp, itrs-lop)
-Data array peripheral type - "itrs-hp"
# following parameter can have one of the five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
-Tag array cell type - "itrs-hp"
# following parameter can have one of the three values -- (itrs-hp, itrs-lstp, itrs-lop)
-Tag array peripheral type - "itrs-hp"
# Bus width includes the data bits and the address bits required by the decoder
//-output/input bus width 512
-output/input bus width 64
-operating temperature (K) 350
-cache type "main memory"
# to model special structure like branch target buffers, directory, etc.
# change the tag size parameter
# if you want cacti to calculate the tagbits, set the tag size to "default"
-tag size (b) "default"
//-tag size (b) 45
# fast - data and tag access happen in parallel
# sequential - data array is accessed after accessing the tag array
# normal - data array lookup and tag access happen in parallel;
#          the final data block is broadcast in the data array h-tree
#          after getting the signal from the tag array
//-access mode (normal, sequential, fast) - "fast"
-access mode (normal, sequential, fast) - "normal"
//-access mode (normal, sequential, fast) - "sequential"
# DESIGN OBJECTIVE for UCA (or banks in NUCA)
//-design objective (weight delay, dynamic power, leakage power, cycle time, area) 100:100:0:0:0
-design objective (weight delay, dynamic power, leakage power, cycle time, area) 0:0:0:100:0
-deviate (delay, dynamic power, leakage power, cycle time, area) 20:100000:100000:100000:1000000
//-deviate (delay, dynamic power, leakage power, cycle time, area) 200:100000:100000:100000:20
-Optimize ED or ED^2 (ED, ED^2, NONE): "NONE"
-Cache model (NUCA, UCA) - "UCA"
//-Wire signalling (fullswing, lowswing, default) - "default"
-Wire signalling (fullswing, lowswing, default) - "Global_10"
-Wire inside mat - "global"
//-Wire inside mat - "semi-global"
-Wire outside mat - "global"
-Interconnect projection - "conservative"
//-Interconnect projection - "aggressive"
-Add ECC - "true"
-Print level (DETAILED, CONCISE) - "DETAILED"
# for debugging
-Print input parameters - "true"
# force CACTI to model the cache with the
# following Ndbl, Ndwl, Nspd, Ndsam,
# and Ndcm values
//-Force cache config - "true"
-Force cache config - "false"
-Ndwl 1
-Ndbl 1
-Nspd 0
-Ndcm 1
-Ndsam1 0
-Ndsam2 0
########### NUCA Params ############
# Objective for NUCA
-NUCAdesign objective (weight delay, dynamic power, leakage power, cycle time, area) 100:100:0:0:100
-NUCAdeviate (delay, dynamic power, leakage power, cycle time, area) 10:10000:10000:10000:10000
# Contention in the network (which is a function of core count and cache level) is one of
# the critical factors used for deciding the optimal bank count value
# core count can be 4, 8, or 16
//-Core count 4
-Core count 8
//-Core count 16
-Cache level (L2/L3) - "L3"
# In order for CACTI to find the optimal NUCA bank value the following
# variable should be assigned 0.
-NUCA bank count 0

234
T1/TP/TP1/cacti_7/etude.txt Normal file
View file

@ -0,0 +1,234 @@
Cache size : 131072
Block size : 64
Associativity : 2
Read only ports : 0
Write only ports : 0
Read write ports : 1
Single ended read ports : 0
Cache banks (UCA) : 1
Technology : 0.09
Temperature : 360
Tag size : 42
array type : Cache
Model as memory : 0
Model as 3D memory : 0
Access mode : 0
Data array cell type : 0
Data array peripheral type : 0
Tag array cell type : 0
Tag array peripheral type : 0
Optimization target : 2
Design objective (UCA wt) : 0 0 0 100 0
Design objective (UCA dev) : 20 100000 100000 100000 100000
Cache model : 0
Nuca bank : 0
Wire inside mat : 1
Wire outside mat : 1
Interconnect projection : 1
Wire signaling : 1
Print level : 1
ECC overhead : 1
Page size : 8192
Burst length : 8
Internal prefetch width : 8
Force cache config : 0
Subarray Driver direction : 1
iostate : WRITE
dram_ecc : NO_ECC
io_type : DDR3
dram_dimm : UDIMM
IO Area (sq.mm) = inf
IO Timing Margin (ps) = -14.1667
IO Votlage Margin (V) = 0.155
IO Dynamic Power (mW) = 1506.36 PHY Power (mW) = 232.752 PHY Wakeup Time (us) = 27.503
IO Termination and Bias Power (mW) = 2505.96
---------- CACTI (version 7.0.3DD Prerelease of Aug, 2012), Uniform Cache Access SRAM Model ----------
Cache Parameters:
Total cache size (bytes): 131072
Number of banks: 1
Associativity: 2
Block size (bytes): 64
Read/write Ports: 1
Read ports: 0
Write ports: 0
Technology size (nm): 90
Access time (ns): 1.47098
Cycle time (ns): 1.86851
Total dynamic read energy per access (nJ): 0.303592
Total dynamic write energy per access (nJ): 0.615022
Total leakage power of a bank (mW): 59.1454
Total gate leakage power of a bank (mW): 4.55691
Cache height x width (mm): 1.57965 x 1.42405
Best Ndwl : 2
Best Ndbl : 2
Best Nspd : 1
Best Ndcm : 2
Best Ndsam L1 : 2
Best Ndsam L2 : 1
Best Ntwl : 2
Best Ntbl : 2
Best Ntspd : 4
Best Ntcm : 1
Best Ntsam L1 : 8
Best Ntsam L2 : 1
Data array, H-tree wire type: Global wires with 30% delay penalty
Tag array, H-tree wire type: Global wires with 30% delay penalty
Time Components:
Data side (with Output driver) (ns): 1.47098
H-tree input delay (ns): 0
Decoder + wordline delay (ns): 0.752867
Bitline delay (ns): 0.546781
Sense Amplifier delay (ns): 0.0107354
H-tree output delay (ns): 0.160596
Tag side (with Output driver) (ns): 0.71334
H-tree input delay (ns): 0
Decoder + wordline delay (ns): 0.466679
Bitline delay (ns): 0.147706
Sense Amplifier delay (ns): 0.0107949
Comparator delay (ns): 0.131234
H-tree output delay (ns): 0.08816
Power Components:
Data array: Total dynamic read energy/access (nJ): 0.286158
Total energy in H-tree (that includes both address and data transfer) (nJ): 0
Output Htree inside bank Energy (nJ): 0
Decoder (nJ): 0.00164907
Wordline (nJ): 0.00212735
Bitline mux & associated drivers (nJ): 0.00335251
Sense amp mux & associated drivers (nJ): 0
Bitlines precharge and equalization circuit (nJ): 0.0161369
Bitlines (nJ): 0.116857
Sense amplifier energy (nJ): 0.00726078
Sub-array output driver (nJ): 0.137516
Total leakage power of a bank (mW): 55.1285
Total leakage power in H-tree (that includes both address and data network) ((mW)): 0
Total leakage power in cells (mW): 0
Total leakage power in row logic(mW): 0
Total leakage power in column logic(mW): 0
Total gate leakage power in H-tree (that includes both address and data network) ((mW)): 0
Tag array: Total dynamic read energy/access (nJ): 0.0174337
Total leakage read/write power of a bank (mW): 4.01688
Total energy in H-tree (that includes both address and data transfer) (nJ): 0
Output Htree inside a bank Energy (nJ): 0
Decoder (nJ): 0.000340468
Wordline (nJ): 0.000710492
Bitline mux & associated drivers (nJ): 0
Sense amp mux & associated drivers (nJ): 0.000330669
Bitlines precharge and equalization circuit (nJ): 0.00425803
Bitlines (nJ): 0.00759182
Sense amplifier energy (nJ): 0.00354912
Sub-array output driver (nJ): 0.000194898
Total leakage power of a bank (mW): 4.01688
Total leakage power in H-tree (that includes both address and data network) ((mW)): 0
Total leakage power in cells (mW): 0
Total leakage power in row logic(mW): 0
Total leakage power in column logic(mW): 0
Total gate leakage power in H-tree (that includes both address and data network) ((mW)): 0
Area Components:
Data array: Area (mm2): 1.78124
Height (mm): 1.57965
Width (mm): 1.12762
Area efficiency (Memory cell area/Total area) - 78.3192 %
MAT Height (mm): 1.57965
MAT Length (mm): 1.12762
Subarray Height (mm): 0.672768
Subarray Length (mm): 0.5427
Tag array: Area (mm2): 0.108777
Height (mm): 0.366956
Width (mm): 0.296431
Area efficiency (Memory cell area/Total area) - 77.9289 %
MAT Height (mm): 0.366956
MAT Length (mm): 0.296431
Subarray Height (mm): 0.168192
Subarray Length (mm): 0.1314
Wire Properties:
Delay Optimal
Repeater size - 61.5792
Repeater spacing - 0.321831 (mm)
Delay - 0.137938 (ns/mm)
PowerD - 0.000766371 (nJ/mm)
PowerL - 0.00525075 (mW/mm)
PowerLgate - 0.000882254 (mW/mm)
Wire width - 0.09 microns
Wire spacing - 0.09 microns
5% Overhead
Repeater size - 34.5792
Repeater spacing - 0.421831 (mm)
Delay - 0.144333 (ns/mm)
PowerD - 0.000519963 (nJ/mm)
PowerL - 0.00224953 (mW/mm)
PowerLgate - 0.000377976 (mW/mm)
Wire width - 0.09 microns
Wire spacing - 0.09 microns
10% Overhead
Repeater size - 32.5792
Repeater spacing - 0.521831 (mm)
Delay - 0.151515 (ns/mm)
PowerD - 0.000485471 (nJ/mm)
PowerL - 0.00171327 (mW/mm)
PowerLgate - 0.000287871 (mW/mm)
Wire width - 0.09 microns
Wire spacing - 0.09 microns
20% Overhead
Repeater size - 27.5792
Repeater spacing - 0.621831 (mm)
Delay - 0.164984 (ns/mm)
PowerD - 0.000447956 (nJ/mm)
PowerL - 0.00121709 (mW/mm)
PowerLgate - 0.000204502 (mW/mm)
Wire width - 0.09 microns
Wire spacing - 0.09 microns
30% Overhead
Repeater size - 21.5792
Repeater spacing - 0.621831 (mm)
Delay - 0.179014 (ns/mm)
PowerD - 0.000419905 (nJ/mm)
PowerL - 0.000952309 (mW/mm)
PowerLgate - 0.000160011 (mW/mm)
Wire width - 0.09 microns
Wire spacing - 0.09 microns
Low-swing wire (1 mm) - Note: Unlike repeated wires,
delay and power values of low-swing wires do not
have a linear relationship with length.
delay - 0.611231 (ns)
powerD - 2.52036e-05 (nJ)
PowerL - 2.71875e-07 (mW)
PowerLgate - 8.41995e-08 (mW)
Wire width - 1.8e-07 microns
Wire spacing - 1.8e-07 microns
top 3 best memory configurations are:
Memory cap: 80 GB num_bobs: 1 bw: 533 (MHz) cost: $731.2 energy: 32.6101 (nJ)
{
(0) BoB cap: 80 GB num_channels: 1 bw: 533 (MHz) cost: $731.2 energy: 32.6101 (nJ)
==============
(0) cap: 80 GB bw: 533 (MHz) cost: $731.2 dpc: 3 energy: 32.6101 (nJ) DIMM: RDIMM low power: F [ 0(4GB) 0(8GB) 1(16GB) 2(32GB) 0(64GB) ]
==============
}
=============================================

506
T1/TP/TP1/cacti_7/extio.cc Normal file
View file

@ -0,0 +1,506 @@
#include "extio.h"
#include <cassert>
// Binds the external-IO model to the IO technology parameter set that every
// extio_* method reads. The pointer is stored as given.
Extio::Extio(IOTechParam *iot)
  : io_param(iot)
{
}
//External IO AREA. Does not include PHY or decap, includes only IO active
//circuit. More details can be found in the CACTI-IO technical report, Chapter 2.3.
//
// Stores the total in io_area and prints it (the label says sq.mm).
void Extio::extio_area()
{
  // Predriver contribution: cubic polynomial in the IO frequency. It is the
  // same in both sizing cases, so it is evaluated once.
  const double predriver_poly =
      io_param->ioarea_k1*io_param->frequency +
      io_param->ioarea_k2*io_param->frequency*io_param->frequency +
      io_param->ioarea_k3*io_param->frequency*
      io_param->frequency*io_param->frequency;

  // The drive stage and the ODT are assumed to be shared, so the driver is
  // sized by whichever is more stringent (lower): Ron, or twice the read
  // termination at the CPU.
  const double odt_resistance = 2*io_param->rtt1_dq_read;
  const double sizing_resistance =
      (odt_resistance < io_param->r_on) ? odt_resistance : io_param->r_on;

  // Area per IO in sq.mm. Only the k0 (driver) term follows the sizing
  // resistance; the predriver term is always scaled by 1/Ron.
  const double area_per_io = io_param->ioarea_c +
      (io_param->ioarea_k0/sizing_resistance) +
      (1/io_param->r_on)*predriver_poly;

  //Total IO area over every DQ, DQS, CA and CLK pin
  io_area = (g_ip->num_dq + g_ip->num_dqs + g_ip->num_ca + g_ip->num_clk) *
      area_per_io;

  printf("IO Area (sq.mm) = ");
  cout << io_area << endl;
}
//External IO Termination Power. More details can be found in the CACTI-IO
//technical report, Chapter 2.1.
//
// Fills power_bias, power_termination_read, power_termination_write and
// power_clk_bias, combines them into io_power_term according to
// g_ip->iostate, and prints the result (the label says mW).
void Extio::extio_power_term()
{
  //Bias and Leakage Power: bias current plus per-pin leakage (nA, hence the
  //1e6 divisor) over every DQ, DQS, CLK and CA pin.
  power_bias = io_param->i_bias * io_param->vdd_io +
      io_param->i_leak * (g_ip->num_dq +
      g_ip->num_dqs +
      g_ip->num_clk +
      g_ip->num_ca) * io_param->vdd_io/1000000;

  // Command/address termination: the same addend appears in the read and
  // write formulas of both the default and the DDR4 case.
  const double ca_termination =
      1000 * g_ip->num_ca * io_param->vdd_io * io_param->vdd_io *
      (0.5 / (2 * (io_param->r_on_ca + io_param->rtt_ca)));

  // Leading factor shared by every DQ/DQS termination formula.
  const double dq_scale = 1000 * (g_ip->num_dq + g_ip->num_dqs) *
      io_param->vdd_io * io_param->vdd_io;

  // Clock bias through the differential termination; Serial IO clears it.
  power_clk_bias = io_param->vdd_io * io_param->v_sw_clk / io_param->r_diff_term * 1000;

  //Termination Power, by IO type
  if (io_param->io_type == Serial)
  {
    // Serial links use one formula for both directions and no clock bias.
    const double serial_termination =
        1000*(g_ip->num_dq)*io_param->vdd_io*io_param->v_sw_clk/io_param->r_diff_term;
    power_termination_read = serial_termination;
    power_termination_write = serial_termination;
    power_clk_bias = 0;
  }
  else if (io_param->io_type == DDR4)
  {
    // DDR4 formulas drop the rtt1/rtt2 terms and use a 0.5 factor.
    power_termination_read = dq_scale * 0.5 *
        (1/(io_param->r_on + io_param->rpar_read + io_param->rs1_dq)) +
        ca_termination;
    power_termination_write = dq_scale * 0.5 *
        (1/(io_param->r_on + io_param->rpar_write)) +
        ca_termination;
  }
  else
  {
    power_termination_read = dq_scale * 0.25 *
        (1/(io_param->r_on + io_param->rpar_read + io_param->rs1_dq) +
         1/(io_param->rtt1_dq_read) + 1/(io_param->rtt2_dq_read)) +
        ca_termination;
    power_termination_write = dq_scale * 0.25 *
        (1/(io_param->r_on + io_param->rpar_write) +
         1/(io_param->rtt1_dq_write) + 1/(io_param->rtt2_dq_write)) +
        ca_termination;
  }

  // Leakage-only figure used for SLEEP and for a parked DDR4 bus (nA to mW).
  const double leakage_only = 1e-6*io_param->i_leak*io_param->vdd_io;

  //Combining the power terms based on STATE (READ/WRITE/IDLE/SLEEP)
  if (g_ip->iostate == READ)
  {
    io_power_term = g_ip->duty_cycle *
        (power_termination_read + power_bias + power_clk_bias);
  }
  else if (g_ip->iostate == WRITE)
  {
    io_power_term = g_ip->duty_cycle *
        (power_termination_write + power_bias + power_clk_bias);
  }
  else if (g_ip->iostate == IDLE)
  {
    // IDLE IO power for DDR4 is leakage since bus can be parked at VDDQ;
    // every other IO type is charged the write termination figure.
    io_power_term = (io_param->io_type == DDR4)
        ? leakage_only
        : g_ip->duty_cycle *
          (power_termination_write + power_bias + power_clk_bias);
  }
  else if (g_ip->iostate == SLEEP)
  {
    io_power_term = leakage_only;
  }
  else
  {
    io_power_term = 0;
  }

  printf("IO Termination and Bias Power (mW) = ");
  cout << io_power_term << endl;
}
//External PHY Power and Wakeup Times. More details can be found in the
//CACTI-IO technical report, Chapter 2.1.
//
// Fills phy_static_power, phy_dynamic_power, phy_power and phy_wtime, then
// prints the power (label: mW) and the wakeup time (label: us).
void Extio::extio_power_phy ()
{
  // Static power: sum of the per-component static coefficients, in mW.
  phy_static_power = io_param->phy_datapath_s + io_param->phy_phase_rotator_s +
      io_param->phy_clock_tree_s + io_param->phy_rx_s + io_param->phy_dcc_s +
      io_param->phy_deskew_s + io_param->phy_leveling_s + io_param->phy_pll_s;

  // Dynamic power: sum of the per-component dynamic coefficients, in mW/Gbps.
  phy_dynamic_power = io_param->phy_datapath_d + io_param->phy_phase_rotator_d +
      io_param->phy_clock_tree_d + io_param->phy_rx_d + io_param->phy_dcc_d +
      io_param->phy_deskew_d + io_param->phy_leveling_d +
      io_param->phy_pll_d;

  //Combining the power terms based on STATE (READ/WRITE/IDLE/SLEEP)
  const bool transferring =
      (g_ip->iostate == READ) || (g_ip->iostate == WRITE);
  if (transferring)
  {
    // Reads and writes cost the same: static plus the bandwidth-scaled
    // dynamic part. Total PHY power in mW.
    phy_power = phy_static_power + 2 * io_param->frequency * g_ip->num_dq * phy_dynamic_power / 1000;
  }
  else if (g_ip->iostate == IDLE)
  {
    phy_power = phy_static_power; // only the static part remains
  }
  else
  {
    phy_power = 0; // SLEEP and any other state
  }

  // Total Wakeup time from SLEEP to ACTIVE. Some of the Wakeup time can be
  // hidden if all components do not need to be serially brought out of SLEEP.
  // This depends on the implementation and user can modify the Wakeup times
  // accordingly.
  phy_wtime = io_param->phy_pll_wtime + io_param->phy_phase_rotator_wtime +
      io_param->phy_rx_wtime + io_param->phy_bandgap_wtime +
      io_param->phy_deskew_wtime + io_param->phy_vrefgen_wtime;

  printf("PHY Power (mW) = ");
  cout << phy_power << " ";
  printf("PHY Wakeup Time (us) = ");
  cout << phy_wtime << endl;
}
//External IO Dynamic Power. Does not include termination or PHY. More details
//can be found in the CACTI-IO technical report, Chapter 2.1.
//
// Fills the per-signal-group members power_{dq,dqs,ca}_{write,read} and
// power_clk, combines them into io_power_dynamic according to g_ip->iostate,
// and prints the result (the label says mW) followed by a space, no newline.
// Serial IO is modelled with zero dynamic power in every group.
void Extio::extio_power_dynamic()
{
  if (io_param->io_type == Serial)
  {
    power_dq_write = 0;
    power_dqs_write = 0;
    power_ca_write = 0;
    power_dq_read = 0;
    power_dqs_read = 0;
    power_ca_read = 0;
    power_clk = 0;
  }
  else
  {
    //Line capacitance calculations for effective c_line
    double c_line = 1e6/(io_param->z0*2*io_param->frequency); //For DDR signals: DQ, DQS, CLK
    double c_line_ca = c_line; //For DDR CA
    double c_line_sdr = 1e6/(io_param->z0*io_param->frequency); //For SDR CA
    double c_line_2T = 1e6*2/(io_param->z0*io_param->frequency); //For 2T timing
    double c_line_3T = 1e6*3/(io_param->z0*io_param->frequency); //For 3T timing

    //Line capacitance if flight time is less than half the bit period:
    //a short line is charged as flight_time / z0.
    if (io_param->t_flight < 1e3/(4*io_param->frequency)) {
      c_line = 1e3*io_param->t_flight/io_param->z0;
    }

    // Same rule for the command/address bus, which has its own flight time.
    // The overrides previously tested t_flight_ca but then used the DQ flight
    // time (t_flight) for the capacitance; they now use t_flight_ca.
    const double c_short_ca = 1e3*io_param->t_flight_ca/io_param->z0;
    if (io_param->t_flight_ca < 1e3/(4*io_param->frequency)) {
      c_line_ca = c_short_ca;
    }
    if (io_param->t_flight_ca < 1e3/(2*io_param->frequency)) {
      c_line_sdr = c_short_ca;
    }
    if (io_param->t_flight_ca < 1e3*2/(2*io_param->frequency)) {
      c_line_2T = c_short_ca;
    }
    if (io_param->t_flight_ca < 1e3*3/(2*io_param->frequency)) {
      c_line_3T = c_short_ca;
    }

    //Line capacitance calculation for the address bus, depending on what
    //address timing is chosen (DDR/SDR/2T/3T). Any other addr_timing value
    //keeps the DDR figure.
    if (g_ip->addr_timing==1.0) {
      c_line_ca = c_line_sdr;
    }
    else if (g_ip->addr_timing==2.0){
      c_line_ca = c_line_2T;
    }
    else if (g_ip->addr_timing==3.0){
      c_line_ca = c_line_3T;
    }

    //Dynamic power per signal group for WRITE and READ modes.
    //Each group sums: TX + line load, first memory load, remaining memory
    //loads, and the internal (predriver) load at full vdd_io swing.
    power_dq_write = g_ip->num_dq * g_ip->activity_dq *
        (io_param->c_tx + c_line) * io_param->vdd_io *
        io_param->v_sw_data_write_line * io_param->frequency / 1000 +
        g_ip->num_dq * g_ip->activity_dq * io_param->c_data *
        io_param->vdd_io * io_param->v_sw_data_write_load1 *
        io_param->frequency / 1000 +
        g_ip->num_dq * g_ip->activity_dq * ((g_ip->num_mem_dq-1) *
        io_param->c_data) * io_param->vdd_io *
        io_param->v_sw_data_write_load2 * io_param->frequency / 1000 +
        g_ip->num_dq * g_ip->activity_dq * io_param->c_int *
        io_param->vdd_io * io_param->vdd_io * io_param->frequency / 1000;

    // DQS carries no activity factor in this model.
    power_dqs_write = g_ip->num_dqs * (io_param->c_tx + c_line) *
        io_param->vdd_io * io_param->v_sw_data_write_line *
        io_param->frequency / 1000 +
        g_ip->num_dqs * io_param->c_data * io_param->vdd_io *
        io_param->v_sw_data_write_load1 * io_param->frequency / 1000 +
        g_ip->num_dqs * ((g_ip->num_mem_dq-1) * io_param->c_data) *
        io_param->vdd_io * io_param->v_sw_data_write_load2 *
        io_param->frequency / 1000 +
        g_ip->num_dqs * io_param->c_int * io_param->vdd_io *
        io_param->vdd_io * io_param->frequency / 1000;

    power_ca_write = g_ip->num_ca * g_ip->activity_ca *
        (io_param->c_tx + io_param->num_mem_ca * io_param->c_addr +
        c_line_ca) *
        io_param->vdd_io * io_param->v_sw_addr * io_param->frequency / 1000 +
        g_ip->num_ca * g_ip->activity_ca * io_param->c_int *
        io_param->vdd_io * io_param->vdd_io * io_param->frequency / 1000;

    power_dq_read = g_ip->num_dq * g_ip->activity_dq *
        (io_param->c_tx + c_line) * io_param->vdd_io *
        io_param->v_sw_data_read_line * io_param->frequency / 1000.0 +
        g_ip->num_dq * g_ip->activity_dq * io_param->c_data *
        io_param->vdd_io * io_param->v_sw_data_read_load1 * io_param->frequency / 1000.0 +
        g_ip->num_dq *g_ip->activity_dq * ((g_ip->num_mem_dq-1) * io_param->c_data) *
        io_param->vdd_io * io_param->v_sw_data_read_load2 * io_param->frequency / 1000.0 +
        g_ip->num_dq * g_ip->activity_dq * io_param->c_int * io_param->vdd_io *
        io_param->vdd_io * io_param->frequency / 1000.0;

    power_dqs_read = g_ip->num_dqs * (io_param->c_tx + c_line) *
        io_param->vdd_io * io_param->v_sw_data_read_line *
        io_param->frequency / 1000.0 +
        g_ip->num_dqs * io_param->c_data * io_param->vdd_io *
        io_param->v_sw_data_read_load1 * io_param->frequency / 1000.0 +
        g_ip->num_dqs * ((g_ip->num_mem_dq-1) * io_param->c_data) *
        io_param->vdd_io * io_param->v_sw_data_read_load2 * io_param->frequency / 1000.0 +
        g_ip->num_dqs * io_param->c_int * io_param->vdd_io * io_param->vdd_io *
        io_param->frequency / 1000.0;

    // The CA bus is driven the same way for reads and writes.
    power_ca_read = g_ip->num_ca * g_ip->activity_ca *
        (io_param->c_tx + io_param->num_mem_ca *
        io_param->c_addr + c_line_ca) *
        io_param->vdd_io * io_param->v_sw_addr * io_param->frequency / 1000 +
        g_ip->num_ca * g_ip->activity_ca * io_param->c_int *
        io_param->vdd_io * io_param->vdd_io * io_param->frequency / 1000;

    power_clk = g_ip->num_clk *
        (io_param->c_tx + io_param->num_mem_clk *
        io_param->c_data + c_line) *
        io_param->vdd_io * io_param->v_sw_clk *io_param->frequency / 1000 +
        g_ip->num_clk * io_param->c_int * io_param->vdd_io *
        io_param->vdd_io * io_param->frequency / 1000;
  }

  //Combining the power terms based on STATE (READ/WRITE/IDLE/SLEEP)
  if (g_ip->iostate == READ) {
    io_power_dynamic = g_ip->duty_cycle * (power_dq_read +
        power_ca_read + power_dqs_read + power_clk);
  }
  else if (g_ip->iostate == WRITE) {
    io_power_dynamic = g_ip->duty_cycle *
        (power_dq_write + power_ca_write + power_dqs_write + power_clk);
  }
  else if (g_ip->iostate == IDLE) {
    // Only the clock keeps toggling when idle.
    io_power_dynamic = g_ip->duty_cycle * (power_clk);
  }
  else if (g_ip->iostate == SLEEP) {
    io_power_dynamic = 0;
  }
  else {
    io_power_dynamic = 0;
  }

  printf("IO Dynamic Power (mW) = ");
  cout << io_power_dynamic << " ";
}
//External IO Timing and Voltage Margins. More details can be found in the
//CACTI-IO technical report, Chapter 2.2.
//
// Fills io_vmargin and io_tmargin with the worst-case margin for the current
// g_ip->iostate and prints both (labels: V and ps). States other than READ
// and WRITE report zero for both margins.
void Extio::extio_eye()
{
  //VOLTAGE MARGINS
  // Serial IO, and any state other than READ/WRITE, reports zero.
  io_vmargin = 0;
  if (io_param->io_type != Serial)
  {
    //Worst-case voltage margin (Swing/2 - Voltage noise): the lesser of the
    //DQ margin and the CA margin is reported. Noise is a proportional part
    //(sensitivity coefficient times swing) plus an independent part.
    if (g_ip->iostate == READ || g_ip->iostate == WRITE)
    {
      const double v_noise_addr =
          io_param->k_noise_addr_sen * io_param->v_sw_addr +
          io_param->v_noise_independent_addr;
      const double ca_margin = io_param->v_sw_addr/2 - v_noise_addr;

      double dq_margin;
      if (g_ip->iostate == READ)
      {
        const double v_noise_read =
            io_param->k_noise_read_sen * io_param->v_sw_data_read_line +
            io_param->v_noise_independent_read;
        dq_margin = io_param->v_sw_data_read_line/2 - v_noise_read;
      }
      else
      {
        const double v_noise_write =
            io_param->k_noise_write_sen * io_param->v_sw_data_write_line +
            io_param->v_noise_independent_write;
        dq_margin = io_param->v_sw_data_write_line/2 - v_noise_write;
      }
      io_vmargin = (dq_margin < ca_margin) ? dq_margin : ca_margin;
    }
  }

  //TIMING MARGINS
  // Starting budgets: a quarter period for DQ, addr_timing half-periods for CA.
  const double dq_budget = (1e6/(4*io_param->frequency));
  const double ca_budget = (1e6*g_ip->addr_timing/(2*io_param->frequency));

  double t_margin_write_setup;
  double t_margin_write_hold;
  double t_margin_read_setup;
  double t_margin_read_hold;
  double t_margin_addr_setup;
  double t_margin_addr_hold;

  if (io_param->io_type == Serial)
  {
    // Serial: no DLL error, skew or correlation terms, and the CA budget is
    // taken whole.
    t_margin_write_setup = dq_budget -
        io_param->t_ds -
        io_param->t_jitter_setup_sen;
    t_margin_write_hold = dq_budget -
        io_param->t_dh - io_param->t_dcd_soc -
        io_param->t_jitter_hold_sen;
    t_margin_read_setup = dq_budget -
        io_param->t_soc_setup -
        io_param->t_jitter_setup_sen;
    t_margin_read_hold = dq_budget -
        io_param->t_soc_hold - io_param->t_dcd_dram -
        io_param->t_dcd_soc -
        io_param->t_jitter_hold_sen;
    t_margin_addr_setup = ca_budget;
    t_margin_addr_hold = ca_budget;
  }
  else
  {
    //Setup and Hold timing margins for DQ WRITE, DQ READ and CA based on
    //timing budget
    t_margin_write_setup = dq_budget -
        io_param->t_ds - io_param->t_error_soc -
        io_param->t_jitter_setup_sen - io_param->t_skew_setup + io_param->t_cor_margin;
    t_margin_write_hold = dq_budget -
        io_param->t_dh - io_param->t_dcd_soc - io_param->t_error_soc -
        io_param->t_jitter_hold_sen - io_param->t_skew_hold + io_param->t_cor_margin;
    t_margin_read_setup = dq_budget -
        io_param->t_soc_setup - io_param->t_error_soc -
        io_param->t_jitter_setup_sen - io_param->t_skew_setup -
        io_param->t_dqsq + io_param->t_cor_margin;
    t_margin_read_hold = dq_budget -
        io_param->t_soc_hold - io_param->t_dcd_dram -
        io_param->t_dcd_soc - io_param->t_error_soc -
        io_param->t_jitter_hold_sen - io_param->t_skew_hold + io_param->t_cor_margin;
    t_margin_addr_setup = ca_budget -
        io_param->t_is - io_param->t_error_soc -
        io_param->t_jitter_addr_setup_sen - io_param->t_skew_setup + io_param->t_cor_margin;
    t_margin_addr_hold = ca_budget -
        io_param->t_ih - io_param->t_dcd_soc - io_param->t_error_soc -
        io_param->t_jitter_addr_hold_sen - io_param->t_skew_hold + io_param->t_cor_margin;
  }

  //Worst-case timing margin per state depending on DQ and CA timing margins
  if (g_ip->iostate == READ || g_ip->iostate == WRITE)
  {
    double worst;
    if (g_ip->iostate == READ) {
      worst = t_margin_read_setup < t_margin_read_hold ?
          t_margin_read_setup : t_margin_read_hold;
    }
    else {
      worst = t_margin_write_setup < t_margin_write_hold ?
          t_margin_write_setup : t_margin_write_hold;
    }
    // Fold in the CA setup and hold margins, keeping the smaller value.
    if (!(worst < t_margin_addr_setup)) {
      worst = t_margin_addr_setup;
    }
    if (!(worst < t_margin_addr_hold)) {
      worst = t_margin_addr_hold;
    }
    io_tmargin = worst;
  }
  else
  {
    io_tmargin = 0;
  }

  //OUTPUTS
  printf("IO Timing Margin (ps) = ");
  cout << io_tmargin <<endl;
  printf("IO Votlage Margin (V) = ");
  cout << io_vmargin << endl;
}

46
T1/TP/TP1/cacti_7/extio.h Normal file
View file

@ -0,0 +1,46 @@
#ifndef _extio_H_
#define _extio_H_
#include "parameter.h"
#include "component.h"
#include "extio_technology.h"
// External (off-chip) IO model. Each extio_* method computes one group of
// figures from the IOTechParam given at construction, stores them in the
// private members below and prints them.
class Extio : public Component
{
  public:
    Extio(IOTechParam *);

    void extio_area();           // IO area                 -> io_area
    void extio_eye();            // timing/voltage margins  -> io_tmargin, io_vmargin
    void extio_power_dynamic();  // dynamic IO power        -> io_power_dynamic
    void extio_power_phy();      // PHY power and wakeup    -> phy_power, phy_wtime
    void extio_power_term();     // termination and bias    -> io_power_term

  private:
    // Technology parameters every method reads; set once by the constructor.
    IOTechParam *io_param;

    // Area (printed as sq.mm).
    double io_area;

    // Termination and bias power (printed as mW) and its components.
    double io_power_term;
    double power_termination_write;
    double power_termination_read;
    double power_bias;
    double power_clk_bias;

    // PHY power (printed as mW), wakeup time (printed as us) and components.
    double phy_power;
    double phy_wtime;
    double phy_static_power;
    double phy_dynamic_power;

    // Dynamic IO power (printed as mW) and its per-signal-group components.
    double io_power_dynamic;
    double power_dq_write;
    double power_dqs_write;
    double power_ca_write;
    double power_dq_read;
    double power_dqs_read;
    double power_ca_read;
    double power_clk;

    // Worst-case timing margin (printed as ps) and voltage margin (printed as V).
    double io_tmargin;
    double io_vmargin;
};
#endif // _extio_H_

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,225 @@
#ifndef __EXTIO_TECH__
#define __EXTIO_TECH__
#include <iostream>
#include "parameter.h"
#include "const.h"
#define NUM_DIMM 1
// Declarations only: the table contents are defined in another translation unit.
// NOTE(review): going by the names, these look like termination-resistance
// lookup tables (rtt1 / rtt2, write / read) per DIMM connection type and DDR
// generation; the meaning of the two indices is not visible here -- confirm
// against the defining source before relying on it.

// DDR3, LRDIMM: 8 x 4 entries each.
extern const double rtt1_wr_lrdimm_ddr3[8][4];
extern const double rtt2_wr_lrdimm_ddr3[8][4];
extern const double rtt1_rd_lrdimm_ddr3[8][4];
extern const double rtt2_rd_lrdimm_ddr3[8][4];
// DDR3, host-to-DIMM: 3 x 4 entries each.
extern const double rtt1_wr_host_dimm_ddr3[3][4];
extern const double rtt2_wr_host_dimm_ddr3[3][4];
extern const double rtt1_rd_host_dimm_ddr3[3][4];
extern const double rtt2_rd_host_dimm_ddr3[3][4];
// DDR3, BoB-to-DIMM: 3 x 4 entries each.
extern const double rtt1_wr_bob_dimm_ddr3[3][4];
extern const double rtt2_wr_bob_dimm_ddr3[3][4];
extern const double rtt1_rd_bob_dimm_ddr3[3][4];
extern const double rtt2_rd_bob_dimm_ddr3[3][4];
// DDR4, LRDIMM: 8 x 4 entries each.
extern const double rtt1_wr_lrdimm_ddr4[8][4];
extern const double rtt2_wr_lrdimm_ddr4[8][4];
extern const double rtt1_rd_lrdimm_ddr4[8][4];
extern const double rtt2_rd_lrdimm_ddr4[8][4];
// DDR4, host-to-DIMM: 3 x 4 entries each.
extern const double rtt1_wr_host_dimm_ddr4[3][4];
extern const double rtt2_wr_host_dimm_ddr4[3][4];
extern const double rtt1_rd_host_dimm_ddr4[3][4];
extern const double rtt2_rd_host_dimm_ddr4[3][4];
// DDR4, BoB-to-DIMM: 3 x 4 entries each.
extern const double rtt1_wr_bob_dimm_ddr4[3][4];
extern const double rtt2_wr_bob_dimm_ddr4[3][4];
extern const double rtt1_rd_bob_dimm_ddr4[3][4];
extern const double rtt2_rd_bob_dimm_ddr4[3][4];
class IOTechParam
{
public:
IOTechParam(InputParameter *);
// connection : 0(bob-dimm), 1(host-dimm), 2(on-dimm)
IOTechParam(InputParameter *, Mem_IO_type io_type, int num_mem_dq, int mem_data_width, int num_dq, int connection, int num_loads, double freq) ;
~IOTechParam();
double num_mem_ca; /* Number of loads on the address bus
based on total number of memories in the channel.For
registered or buffered configurations, the num_mem_dq and num_mem_ca is per buffer. */
double num_mem_clk; /* Number of loads on the clock as total
memories in the channel / number of clock lines available */
//Technology Parameters
// IO Supply voltage (V)
double vdd_io; /* Voltage swing on CLK/CLKB (V) (swing on the CLK pin if it
is differentially terminated) */
double v_sw_clk;
// Loading parameters
double c_int; /*Internal IO loading (pF) (loading within the IO, due to
predriver nets) */
double c_tx; /* IO TX self-load including package (pF) (loading at the
CPU TX pin) */
double c_data; /* Device loading per memory data pin (pF) (DRAM device
load for DQ per die) */
double c_addr; /* Device loading per memory address pin (pF) (DRAM
device load for CA per die) */
double i_bias; /* Bias current (mA) (includes bias current for the whole memory
bus due to RX Vref based receivers */
double i_leak; // Active leakage current per pin (nA)
// IO Area coefficients
double ioarea_c; /* sq.mm. (IO Area baseline coeeficient for control
circuitry and overhead) */
double ioarea_k0; /* sq.mm * ohms (IO Area coefficient for the driver, for
unit drive strength or output impedance) */
double ioarea_k1; /* sq.mm * ohms / MHz (IO Area coefficient for the
predriver final stage, based on fanout needed) */
double ioarea_k2; /* sq.mm * ohms / MHz^2 (IO Area coefficient for
predriver middle stage, based on fanout needed) */
double ioarea_k3; /* sq.mm * ohms / MHz^3 (IO Area coefficient for
predriver first stage, based on fanout needed) */
// Timing parameters (ps)
double t_ds; //DQ setup time at DRAM
double t_is; //CA setup time at DRAM
double t_dh; //DQ hold time at DRAM
double t_ih; //CA hold time at DRAM
double t_dcd_soc; //Duty-cycle distortion at the CPU/SOC
double t_dcd_dram; //Duty-cycle distortion at the DRAM
double t_error_soc; //Timing error due to edge placement uncertainty of the DLL
double t_skew_setup;//Setup skew between DQ/DQS or CA/CLK after deskewing the lines
double t_skew_hold; //Hold skew between DQ/DQS or CA/CLK after deskewing the lines
double t_dqsq; //DQ-DQS skew at the DRAM output during Read
//double t_qhs; //DQ-DQS hold factor at the DRAM output during Read FIXME: I am commenting it as the variable is never used.
double t_soc_setup; //Setup time at SOC input dueing Read
double t_soc_hold; //Hold time at SOC input during Read
double t_jitter_setup; /* Half-cycle jitter on the DQS at DRAM input
affecting setup time */
double t_jitter_hold; /* Half-cycle jitter on the DQS at the DRAM input
affecting hold time */
double t_jitter_addr_setup; /* Half-cycle jitter on the CLK at DRAM input
affecting setup time */
double t_jitter_addr_hold; /* Half-cycle jitter on the CLK at the DRAM
input affecting hold time */
double t_cor_margin; // Statistical correlation margin
//Termination Parameters
double r_diff_term; /* Differential termination resister if
used for CLK (Ohm) */
// ODT related termination resistor values (Ohm)
double rtt1_dq_read; //DQ Read termination at CPU
double rtt2_dq_read; //DQ Read termination at inactive DRAM
double rtt1_dq_write; //DQ Write termination at active DRAM
double rtt2_dq_write; //DQ Write termination at inactive DRAM
double rtt_ca; //CA fly-by termination
double rs1_dq; //Series resistor at active DRAM
double rs2_dq; //Series resistor at inactive DRAM
double r_stub_ca; //Series resistor for the fly-by channel
double r_on; //Driver impedance
double r_on_ca; //CA driver impedance
double z0; //Line impedance (ohms): Characteristic impedance of the route.
double t_flight; /* Flight time of the interconnect (ns) (approximately
180ps/inch for FR4) */
double t_flight_ca; /* Flight time of the Control/Address (CA)
interconnect (ns) (approximately 180ps/inch for FR4) */
// Voltage noise coefficients
double k_noise_write; //Proportional noise coefficient for Write mode
double k_noise_read; //Proportional noise coefficient for Read mode
double k_noise_addr; //Proportional noise coefficient for Address bus
double v_noise_independent_write; //Independent noise voltage for Write mode
double v_noise_independent_read; //Independent noise voltage for Read mode
double v_noise_independent_addr; //Independent noise voltage for Address bus
//SENSITIVITY INPUTS FOR TIMING AND VOLTAGE NOISE
/* This is a user-defined section that depends on the channel sensitivity
* to IO and DRAM parameters. The t_jitter_* and k_noise_* are the
* parameters that are impacted based on the channel analysis. The user
* can define any relationship between the termination, loading and
* configuration parameters AND the t_jitter/k_noise parameters. */
double k_noise_write_sen;
double k_noise_read_sen;
double k_noise_addr_sen;
double t_jitter_setup_sen;
double t_jitter_hold_sen;
double t_jitter_addr_setup_sen;
double t_jitter_addr_hold_sen;
//SWING AND TERMINATION CALCULATIONS
//R|| calculation
double rpar_write;
double rpar_read;
//Swing calculation
double v_sw_data_read_load1; //Swing for DQ at dram1 during READ
double v_sw_data_read_load2; //Swing for DQ at dram2 during READ
double v_sw_data_read_line; //Swing for DQ on the line during READ
double v_sw_addr; //Swing for the address bus
double v_sw_data_write_load1; //Swing for DQ at dram1 during WRITE
double v_sw_data_write_load2; //Swing for DQ at dram2 during WRITE
double v_sw_data_write_line; //Swing for DQ on the line during WRITE
// PHY Static Power Coefficients (mW)
double phy_datapath_s; // Datapath Static Power
double phy_phase_rotator_s; // Phase Rotator Static Power
double phy_clock_tree_s; // Clock Tree Static Power
double phy_rx_s; // Receiver Static Power
double phy_dcc_s; // Duty Cycle Correction Static Power
double phy_deskew_s; // Deskewing Static Power
double phy_leveling_s; // Write and Read Leveling Static Power
double phy_pll_s; // PHY PLL Static Power
// PHY Dynamic Power Coefficients (mW/Gbps)
double phy_datapath_d; // Datapath Dynamic Power
double phy_phase_rotator_d; // Phase Rotator Dynamic Power
double phy_clock_tree_d; // Clock Tree Dynamic Power
double phy_rx_d; // Receiver Dynamic Power
double phy_dcc_d; // Duty Cycle Correction Dynamic Power
double phy_deskew_d; // Deskewing Dynamic Power
double phy_leveling_d; // Write and Read Leveling Dynamic Power
double phy_pll_d; // PHY PLL Dynamic Power
//PHY Wakeup Times (Sleep to Active) (microseconds)
double phy_pll_wtime; // PHY PLL Wakeup Time
double phy_phase_rotator_wtime; // Phase Rotator Wakeup Time
double phy_rx_wtime; // Receiver Wakeup Time
double phy_bandgap_wtime; // Bandgap Wakeup Time
double phy_deskew_wtime; // Deskewing Wakeup Time
double phy_vrefgen_wtime; // VREF Generator Wakeup Time
// RTT values depend on the number of loads, frequency, and link_type
double frequency;
Mem_IO_type io_type;
int frequnecy_index(Mem_IO_type type);
};
#endif

640
T1/TP/TP1/cacti_7/htree2.cc Normal file
View file

@ -0,0 +1,640 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#include "htree2.h"
#include "wire.h"
#include <assert.h>
#include <iostream>
/* Builds the H-tree model for one signal class and evaluates it immediately.
 *
 * wire_model       wire type used for every link of the tree
 * mat_w, mat_h     width / height of one mat (of one bank when uca_tree_ is
 *                  true, since the tree then connects banks)
 * a_bits           address bus width
 * d_inbits         data-in bus width
 * search_data_in   search data-in bus width
 * d_outbits        data-out bus width
 * search_data_out  search data-out bus width
 * bl, wl           stored as ndbl / ndwl; both must be at least 2
 * htree_type       selects the bus width that drives the tree and whether
 *                  in_htree() or out_htree() is evaluated
 * uca_tree_        true for an inter-bank tree
 * search_tree_     stored in search_tree and consulted by the tree models
 * dt               device parameters used for the gates; must not be null
 *
 * On return area, delay and power hold the tree results, power_bit holds the
 * power figures before read dynamic energy is scaled by the root bus width.
 */
Htree2::Htree2(
    enum Wire_type wire_model, double mat_w, double mat_h,
    int a_bits, int d_inbits, int search_data_in, int d_outbits, int search_data_out, int bl, int wl, enum Htree_type htree_type,
    bool uca_tree_, bool search_tree_, /*TechnologyParameter::*/DeviceType *dt)
  : in_rise_time(0), out_rise_time(0),
    max_unpipelined_link_delay(0), //TODO
    // Zero-initialised so that no member is ever read while indeterminate
    // (e.g. init_wire_bw when asserts are compiled out and htree_type is invalid).
    wire_bw(0), init_wire_bw(0),
    tree_type(htree_type),
    htree_hnodes(0), htree_vnodes(0),
    mat_width(mat_w), mat_height(mat_h),
    add_bits(a_bits), data_in_bits(d_inbits), search_data_in_bits(search_data_in), data_out_bits(d_outbits),
    search_data_out_bits(search_data_out), ndbl(bl), ndwl(wl),
    uca_tree(uca_tree_), search_tree(search_tree_), wt(wire_model), deviceType(dt)
{
  // Organisations with a single row or column of mats are not handled by
  // this model; callers are expected to pass at least 2 in each dimension.
  assert(ndbl >= 2 && ndwl >= 2);
  // deviceType is dereferenced below and in every gate model.
  assert(deviceType != 0);

  min_w_nmos = g_tp.min_w_nmos_;
  min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio * min_w_nmos;

  switch (htree_type)
  {
    case Add_htree:
      wire_bw = init_wire_bw = add_bits;
      in_htree();
      break;
    case Data_in_htree:
      wire_bw = init_wire_bw = data_in_bits;
      in_htree();
      break;
    case Data_out_htree:
      wire_bw = init_wire_bw = data_out_bits;
      out_htree();
      break;
    case Search_in_htree:
      wire_bw = init_wire_bw = search_data_in_bits;//in_search_tree is broad cast, out_htree is not.
      in_htree();
      break;
    case Search_out_htree:
      wire_bw = init_wire_bw = search_data_out_bits;
      out_htree();
      break;
    default:
      assert(0);
      break;
  }

  // Snapshot the figures first, then scale read dynamic energy by the bus
  // width at the root of the tree.
  power_bit = power;
  power.readOp.dynamic *= init_wire_bw;

  assert(power.readOp.dynamic >= 0);
  assert(power.readOp.leakage >= 0);
}
// NAND stage placed at an input H-tree node.
//   s1    size of the repeater whose input capacitance the nand is matched to
//   s2    size of the repeater driven by the nand
//   l_eff length of the wire feeding the nand (used to build a Wire whose
//         out_rise_time is the input ramp for the delay calculation)
// Adds the stage's contribution to delay and to the read/search dynamic,
// sub-threshold leakage and gate leakage entries of power.
void Htree2::input_nand(double s1, double s2, double l_eff)
{
  Wire w1(wt, l_eff);
  const double pton_size = deviceType->n_to_p_eff_curr_drv_ratio;
  const double vdd       = deviceType->Vdd;
  const double vth_frac  = deviceType->Vth/deviceType->Vdd;

  // input capacitance of a repeater = input capacitance of nand.
  double nsize = s1*(1 + pton_size)/(2 + pton_size);
  if (nsize < 1)
  {
    nsize = 1;  // never smaller than a minimum-size device
  }

  // Capacitances shared by the delay and energy expressions.
  const double c_drain_n   = drain_C_(nsize*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def);
  const double c_drain_p   = drain_C_(pton_size * nsize*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def);
  const double c_gate_next = gate_C(s2*(min_w_nmos + min_w_pmos), 0);

  // RC time constant of the stage, then Horowitz delay for a rising edge.
  const double tc = 2*tr_R_on(nsize*min_w_nmos, NCH, 1) *
                    (c_drain_n*2 + 2 * c_gate_next);
  delay += horowitz(w1.out_rise_time, tc, vth_frac, vth_frac, RISE);

  // Switching energy of one bit; the search figure is scaled by the
  // current bus width.
  const double e_bit = 0.5 *
                       (2*c_drain_p + c_drain_n + 2*c_gate_next) *
                       vdd * vdd;
  power.readOp.dynamic   += e_bit;
  power.searchOp.dynamic += e_bit * wire_bw;

  // Leakage of a 2-input nand for every wire of the bus.
  const double w_n = min_w_nmos*(nsize*2);
  const double w_p = min_w_pmos * nsize * 2;
  power.readOp.leakage      += (wire_bw*cmos_Isub_leakage(w_n, w_p, 2, nand))*vdd;
  power.readOp.gate_leakage += (wire_bw*cmos_Ig_leakage(w_n, w_p, 2, nand))*vdd;
}
// Tristate buffer at an output H-tree node, modelled as an inverter, a nand,
// a nor and a pair of driver transistors.
//   s1    size of the repeater whose input capacitance the buffer is matched to
//   s2    size of the repeater on the wire the buffer drives
//   l_eff length of that wire segment
// Adds the buffer's contribution to delay and to the read/search dynamic,
// sub-threshold leakage and gate leakage entries of power.
void Htree2::output_buffer(double s1, double s2, double l_eff)
{
  Wire w1(wt, l_eff);
  const double pton_size = deviceType->n_to_p_eff_curr_drv_ratio;
  const double vdd       = deviceType->Vdd;
  const double vth_frac  = deviceType->Vth/deviceType->Vdd;
  const double w_min_sum = min_w_nmos + min_w_pmos;

  // input capacitance of repeater = input capacitance of nand + nor.
  double size = s1*(1 + pton_size)/(2 + pton_size + 1 + 2*pton_size);

  const double c_gate_s1 = gate_C(s1*w_min_sum, 0);
  const double c_gate_s2 = gate_C(s2*w_min_sum, 0);

  // stage effort of a repeater in a wire
  const double s_eff = (c_gate_s2 + w1.wire_cap(l_eff*1e-6,true))/c_gate_s2;
  // size of the output driver transistors
  const double tr_size = c_gate_s1 * 1/2/(s_eff*gate_C(min_w_pmos, 0));

  if (size < 1)
  {
    size = 1;  // never smaller than a minimum-size device
  }

  const double res_nor    = 2*tr_R_on(size*min_w_pmos, PCH, 1);
  const double res_ptrans = tr_R_on(tr_size*min_w_nmos, NCH, 1);

  // Capacitances reused by the delay and energy expressions.
  const double cd_p    = drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def);
  const double cd_n    = drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def);
  const double cd_p_tr = drain_C_(tr_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def);
  const double cd_n_tr = drain_C_(tr_size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def);
  const double cg_p_tr = gate_C(tr_size*min_w_pmos, 0);

  // Two-stage RC: gate output node, then the driver output node.
  const double cap_nand_out   = cd_n + cd_p*2 + cg_p_tr;
  const double cap_ptrans_out = 2 *(cd_p_tr + cd_n_tr) + c_gate_s1;
  const double tc = res_nor * cap_nand_out + (res_nor + res_ptrans) * cap_ptrans_out;
  delay += horowitz(w1.out_rise_time, tc, vth_frac, vth_frac, RISE);

  // Switched capacitance of each sub-circuit, in accumulation order.
  const double c_switched[4] = {
    2*cd_p + cd_n + cg_p_tr,                        // nand
    cd_p + cd_n + gate_C(size*w_min_sum, 0),        // not
    cd_p + 2*cd_n + gate_C(tr_size*w_min_sum, 0),   // nor
    (cd_p_tr + cd_n_tr)*2 + c_gate_s1               // output transistor
  };
  for (int stage = 0; stage < 4; ++stage)
  {
    const double e_bit = 0.5 * c_switched[stage] * vdd * vdd;
    power.readOp.dynamic   += e_bit;
    power.searchOp.dynamic += e_bit*init_wire_bw;
  }

  // Leakage, per wire of the bus. The terms are the same whether or not
  // this is an inter-bank (uca_tree) tree.
  const double w_n_drv  = min_w_nmos*tr_size*2;
  const double w_p_drv  = min_w_pmos*tr_size*2;
  const double w_n_gate = min_w_nmos*size*3;
  const double w_p_gate = min_w_pmos*size*3;

  power.readOp.leakage += cmos_Isub_leakage(w_n_drv, w_p_drv, 1, inv)*vdd*wire_bw;    /*inverter + output tr*/
  power.readOp.leakage += cmos_Isub_leakage(w_n_gate, w_p_gate, 2, nand)*vdd*wire_bw; //nand
  power.readOp.leakage += cmos_Isub_leakage(w_n_gate, w_p_gate, 2, nor)*vdd*wire_bw;  //nor

  power.readOp.gate_leakage += cmos_Ig_leakage(w_n_drv, w_p_drv, 1, inv)*vdd*wire_bw;    /*inverter + output tr*/
  power.readOp.gate_leakage += cmos_Ig_leakage(w_n_gate, w_p_gate, 2, nand)*vdd*wire_bw; //nand
  power.readOp.gate_leakage += cmos_Ig_leakage(w_n_gate, w_p_gate, 2, nor)*vdd*wire_bw;  //nor
}
/* Calculates the area, delay and power of an input h-tree. The constructor
 * calls this for the address, data-in and search-in trees.
 * A nand gate is used at each node to limit the signal (input_nand(),
 * only when uca_tree is false).
 * The area of an unbalanced htree (rows != columns)
 * depends on how data is traversed.
 * In the following function, if (no. of rows < no. of columns),
 * then data first traverses the excess hor. links until the numbers of
 * vertical and horizontal nodes are the same.
 * If the no. of rows is bigger, then data traverses
 * a hor. link followed by a ver. link in a repeated
 * fashion (similar to a balanced tree) until there are no
 * hor. links left. After this it goes through the remaining vertical
 * links.
 *
 * Writes area.h, area.w, delay and the readOp/searchOp entries of power,
 * and modifies wire_bw while descending the tree.
 */
void
Htree2::in_htree()
{
//temp var
double s1 = 0, s2 = 0, s3 = 0;
double l_eff = 0;
Wire *wtemp1 = 0, *wtemp2 = 0, *wtemp3 = 0;
double len = 0, ht = 0;
int option = 0;
int h = (int) _log2(ndwl/2); // horizontal nodes
int v = (int) _log2(ndbl/2); // vertical nodes
double len_temp;
double ht_temp;
// ht_temp / len_temp: half of the tree's height / width, i.e. the array of
// mats (or banks) plus the space taken by the routed wires, divided by two.
if (uca_tree)
{// This computation does not consider the wires that route from edge to middle.
ht_temp = (mat_height*ndbl/2 +/* since uca_tree models interbank tree, mat_height => bank height */
((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
2 * (1-pow(0.5,h))))/2;
len_temp = (mat_width*ndwl/2 +
((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
2 * (1-pow(0.5,v))))/2;
}
else
{
// Intra-bank tree: three cases depending on the aspect ratio of the mat array.
if (ndwl == ndbl) {
ht_temp = ((mat_height*ndbl/2) +
((add_bits + (search_data_in_bits + search_data_out_bits))* (ndbl/2-1) * g_tp.wire_outside_mat.pitch) +
((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
)/2;
len_temp = (mat_width*ndwl/2 +
((add_bits + (search_data_in_bits + search_data_out_bits)) * (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
}
else if (ndwl > ndbl) {
// more columns than rows
double excess_part = (_log2(ndwl/2) - _log2(ndbl/2));
ht_temp = ((mat_height*ndbl/2) +
((add_bits + + (search_data_in_bits + search_data_out_bits)) * ((ndbl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
(data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch *
(2*(1 - pow(0.5, h-v)) + pow(0.5, v-h) * v))/2;
len_temp = (mat_width*ndwl/2 +
((add_bits + (search_data_in_bits + search_data_out_bits))* (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
}
else {
// more rows than columns
double excess_part = (_log2(ndbl/2) - _log2(ndwl/2));
ht_temp = ((mat_height*ndbl/2) +
((add_bits + (search_data_in_bits + search_data_out_bits))* ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
)/2;
len_temp = (mat_width*ndwl/2 +
((add_bits + (search_data_in_bits + search_data_out_bits)) * ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
(data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * (h + 2*(1-pow(0.5, v-h))))/2;
}
}
area.h = ht_temp * 2;
area.w = len_temp * 2;
delay = 0;
power.readOp.dynamic = 0;
power.readOp.leakage = 0;
power.searchOp.dynamic =0;
// NOTE(review): power.readOp.gate_leakage is accumulated below but not reset
// here (out_htree() does reset it); this presumably relies on power starting
// at zero -- confirm.
// Length of the first horizontal link and of the first vertical link.
len = len_temp;
ht = ht_temp/2;
// Walk the tree from the root towards the mats; each iteration consumes one
// horizontal link (option 0), one horizontal plus one vertical link
// (option 1), or one vertical link (option 2). Link lengths halve as they
// are consumed.
while (v > 0 || h > 0)
{
// release the wires of the previous iteration
if (wtemp1) delete wtemp1;
if (wtemp2) delete wtemp2;
if (wtemp3) delete wtemp3;
if (h > v)
{
//the iteration considers only one horizontal link
wtemp1 = new Wire(wt, len); // hor
wtemp2 = new Wire(wt, len/2); // ver
len_temp = len;
len /= 2;
wtemp3 = 0;
h--;
option = 0;
}
else if (v>0 && h>0)
{
//considers one horizontal link and one vertical link
wtemp1 = new Wire(wt, len); // hor
wtemp2 = new Wire(wt, ht); // ver
wtemp3 = new Wire(wt, len/2); // next hor
len_temp = len;
ht_temp = ht;
len /= 2;
ht /= 2;
v--;
h--;
option = 1;
}
else
{
// considers only one vertical link
assert(h == 0);
wtemp1 = new Wire(wt, ht); // ver
wtemp2 = new Wire(wt, ht/2); // hor
ht_temp = ht;
ht /= 2;
wtemp3 = 0;
v--;
option = 2;
}
// first link of this iteration; dynamic energy is kept per wire, search
// energy and leakage are scaled by the current bus width
delay += wtemp1->delay;
power.readOp.dynamic += wtemp1->power.readOp.dynamic;
power.searchOp.dynamic += wtemp1->power.readOp.dynamic*wire_bw;
power.readOp.leakage += wtemp1->power.readOp.leakage*wire_bw;
power.readOp.gate_leakage += wtemp1->power.readOp.gate_leakage*wire_bw;
if ((uca_tree == false && option == 2) || search_tree==true)
{
wire_bw*=2; // wire bandwidth doubles only for vertical branches
// (when search_tree is set it doubles on every iteration, whatever the option)
}
if (uca_tree == false)
{
// Size the nand for the repeaters of the link before it (s1) and the
// link after it (s2). A link shorter than the repeater spacing gets a
// proportionally smaller repeater.
if (len_temp > wtemp1->repeater_spacing)
{
s1 = wtemp1->repeater_size;
l_eff = wtemp1->repeater_spacing;
}
else
{
s1 = (len_temp/wtemp1->repeater_spacing) * wtemp1->repeater_size;
l_eff = len_temp;
}
if (ht_temp > wtemp2->repeater_spacing)
{
s2 = wtemp2->repeater_size;
}
else
{
// NOTE(review): the test above uses ht_temp but the scaling uses len_temp
// -- confirm this is intended.
s2 = (len_temp/wtemp2->repeater_spacing) * wtemp2->repeater_size;
}
// first level
input_nand(s1, s2, l_eff);
}
// only option 1 has a second (vertical) link to account for
if (option != 1)
{
continue;
}
// second level
delay += wtemp2->delay;
power.readOp.dynamic += wtemp2->power.readOp.dynamic;
power.searchOp.dynamic += wtemp2->power.readOp.dynamic*wire_bw;
power.readOp.leakage += wtemp2->power.readOp.leakage*wire_bw;
power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
// NOTE(review): both branches below add wtemp2's leakage and gate leakage
// again, on top of the two lines just above -- confirm the double count is
// intended.
if (uca_tree)
{
power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw);
power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
}
else
{
power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw);
power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
wire_bw*=2;
if (ht_temp > wtemp3->repeater_spacing)
{
s3 = wtemp3->repeater_size;
l_eff = wtemp3->repeater_spacing;
}
else
{
// NOTE(review): same ht_temp / len_temp mix as for s2 above -- confirm.
s3 = (len_temp/wtemp3->repeater_spacing) * wtemp3->repeater_size;
l_eff = ht_temp;
}
input_nand(s2, s3, l_eff);
}
}
// release the wires of the last iteration
if (wtemp1) delete wtemp1;
if (wtemp2) delete wtemp2;
if (wtemp3) delete wtemp3;
}
/* Calculates the area, delay and power of an output h-tree. The constructor
 * calls this for the data-out and search-out trees.
 * A tristate buffer is used to handle fan-ins (output_buffer(), only when
 * uca_tree is false).
 * The area of an unbalanced htree (rows != columns)
 * depends on how data is traversed.
 * In the following function, if (no. of rows < no. of columns),
 * then data first traverses the excess hor. links until the numbers of
 * vertical and horizontal nodes are the same.
 * If the no. of rows is bigger, then data traverses
 * a hor. link followed by a ver. link in a repeated
 * fashion (similar to a balanced tree) until there are no
 * hor. links left. After this it goes through the remaining vertical
 * links.
 *
 * Writes area.h, area.w, delay and the readOp/searchOp entries of power,
 * and modifies wire_bw while walking the tree.
 */
void Htree2::out_htree()
{
//temp var
double s1 = 0, s2 = 0, s3 = 0;
double l_eff = 0;
Wire *wtemp1 = 0, *wtemp2 = 0, *wtemp3 = 0;
double len = 0, ht = 0;
int option = 0;
int h = (int) _log2(ndwl/2); // horizontal nodes
int v = (int) _log2(ndbl/2); // vertical nodes
double len_temp;
double ht_temp;
// ht_temp / len_temp: half of the tree's height / width, i.e. the array of
// mats (or banks) plus the space taken by the routed wires, divided by two.
// The expressions are the same as in in_htree().
if (uca_tree)
{
ht_temp = (mat_height*ndbl/2 +/* since uca_tree models interbank tree, mat_height => bank height */
((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
2 * (1-pow(0.5,h))))/2;
len_temp = (mat_width*ndwl/2 +
((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
2 * (1-pow(0.5,v))))/2;
}
else
{
// Intra-bank tree: three cases depending on the aspect ratio of the mat array.
if (ndwl == ndbl) {
ht_temp = ((mat_height*ndbl/2) +
((add_bits+ (search_data_in_bits + search_data_out_bits)) * (ndbl/2-1) * g_tp.wire_outside_mat.pitch) +
((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
)/2;
len_temp = (mat_width*ndwl/2 +
((add_bits + (search_data_in_bits + search_data_out_bits)) * (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
}
else if (ndwl > ndbl) {
// more columns than rows
double excess_part = (_log2(ndwl/2) - _log2(ndbl/2));
ht_temp = ((mat_height*ndbl/2) +
((add_bits + (search_data_in_bits + search_data_out_bits)) * ((ndbl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
(data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch *
(2*(1 - pow(0.5, h-v)) + pow(0.5, v-h) * v))/2;
len_temp = (mat_width*ndwl/2 +
((add_bits + (search_data_in_bits + search_data_out_bits))* (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
}
else {
// more rows than columns
double excess_part = (_log2(ndbl/2) - _log2(ndwl/2));
ht_temp = ((mat_height*ndbl/2) +
((add_bits + (search_data_in_bits + search_data_out_bits))* ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
)/2;
len_temp = (mat_width*ndwl/2 +
((add_bits + (search_data_in_bits + search_data_out_bits))* ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
(data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * (h + 2*(1-pow(0.5, v-h))))/2;
}
}
area.h = ht_temp * 2;
area.w = len_temp * 2;
delay = 0;
power.readOp.dynamic = 0;
power.readOp.leakage = 0;
power.readOp.gate_leakage = 0;
// NOTE(review): power.searchOp.dynamic is accumulated below but not reset
// here (in_htree() does reset it); this presumably relies on power starting
// at zero -- confirm.
//cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
// Length of the first horizontal link and of the first vertical link.
len = len_temp;
ht = ht_temp/2;
while (v > 0 || h > 0)
{ //finds delay/power of each link in the tree
// release the wires of the previous iteration
if (wtemp1) delete wtemp1;
if (wtemp2) delete wtemp2;
if (wtemp3) delete wtemp3;
if(h > v) {
//the iteration considers only one horizontal link
wtemp1 = new Wire(wt, len); // hor
wtemp2 = new Wire(wt, len/2); // ver
len_temp = len;
len /= 2;
wtemp3 = 0;
h--;
option = 0;
}
else if (v>0 && h>0) {
//considers one horizontal link and one vertical link
wtemp1 = new Wire(wt, len); // hor
wtemp2 = new Wire(wt, ht); // ver
wtemp3 = new Wire(wt, len/2); // next hor
len_temp = len;
ht_temp = ht;
len /= 2;
ht /= 2;
v--;
h--;
option = 1;
}
else {
// considers only one vertical link
assert(h == 0);
wtemp1 = new Wire(wt, ht); // ver
wtemp2 = new Wire(wt, ht/2); // following link, half the length
ht_temp = ht;
ht /= 2;
wtemp3 = 0;
v--;
option = 2;
}
// first link of this iteration; unlike in_htree(), search energy is scaled
// by the root bus width (init_wire_bw) while leakage uses the current width
delay += wtemp1->delay;
power.readOp.dynamic += wtemp1->power.readOp.dynamic;
power.searchOp.dynamic += wtemp1->power.readOp.dynamic*init_wire_bw;
power.readOp.leakage += wtemp1->power.readOp.leakage*wire_bw;
power.readOp.gate_leakage += wtemp1->power.readOp.gate_leakage*wire_bw;
//cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
// bus width doubles after a vertical-only link of an intra-bank tree, or on
// every iteration when search_tree is set
if ((uca_tree == false && option == 2) || search_tree==true)
{
wire_bw*=2;
}
if (uca_tree == false)
{
// Size the buffer for the repeaters of the link before it (s1) and the
// link after it (s2). A link shorter than the repeater spacing gets a
// proportionally smaller repeater.
if (len_temp > wtemp1->repeater_spacing)
{
s1 = wtemp1->repeater_size;
l_eff = wtemp1->repeater_spacing;
}
else
{
s1 = (len_temp/wtemp1->repeater_spacing) * wtemp1->repeater_size;
l_eff = len_temp;
}
if (ht_temp > wtemp2->repeater_spacing)
{
s2 = wtemp2->repeater_size;
}
else
{
// NOTE(review): the test above uses ht_temp but the scaling uses len_temp
// -- confirm this is intended.
s2 = (len_temp/wtemp2->repeater_spacing) * wtemp2->repeater_size;
}
// first level
output_buffer(s1, s2, l_eff);
}
// only option 1 has a second (vertical) link to account for
if (option != 1)
{
continue;
}
// second level
delay += wtemp2->delay;
power.readOp.dynamic += wtemp2->power.readOp.dynamic;
power.searchOp.dynamic += wtemp2->power.readOp.dynamic*init_wire_bw;
power.readOp.leakage += wtemp2->power.readOp.leakage*wire_bw;
power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
//cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
// NOTE(review): both branches below add wtemp2's leakage and gate leakage
// again, on top of the two lines just above -- confirm the double count is
// intended.
if (uca_tree)
{
power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw);
power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
}
else
{
power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw);
power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
wire_bw*=2;
if (ht_temp > wtemp3->repeater_spacing)
{
s3 = wtemp3->repeater_size;
l_eff = wtemp3->repeater_spacing;
}
else
{
// NOTE(review): same ht_temp / len_temp mix as for s2 above -- confirm.
s3 = (len_temp/wtemp3->repeater_spacing) * wtemp3->repeater_size;
l_eff = ht_temp;
}
output_buffer(s2, s3, l_eff);
}
//cout<<"power.readOp.leakage"<<power.readOp.leakage<<endl;
//cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
//cout<<"wtemp2->power.readOp.gate_leakage"<<wtemp2->power.readOp.gate_leakage<<endl;
}
// release the wires of the last iteration
if (wtemp1) delete wtemp1;
if (wtemp2) delete wtemp2;
if (wtemp3) delete wtemp3;
}

View file

@ -0,0 +1,97 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#ifndef __HTREE2_H__
#define __HTREE2_H__
#include "basic_circuit.h"
#include "component.h"
#include "parameter.h"
#include "assert.h"
#include "subarray.h"
#include "cacti_interface.h"
#include "wire.h"
// H-tree interconnect model. The constructor evaluates the tree selected by
// h_type and leaves the results in the inherited area, delay and power
// members.
// leakage power includes entire htree in a bank (when uca_tree == false)
// leakage power includes only part to one bank when uca_tree == true
class Htree2 : public Component
{
public:
// mat_w / mat_h: mat dimensions; add, data_in, search_data_in, data_out,
// search_data_out: bus widths; bl / wl: stored as ndbl / ndwl (the
// constructor asserts both are >= 2); h_type: which tree to evaluate;
// dt: device parameters used for the gates (defaults to g_tp.peri_global).
Htree2(enum Wire_type wire_model,
double mat_w, double mat_h, int add, int data_in, int search_data_in, int data_out, int search_data_out, int bl, int wl,
enum Htree_type h_type, bool uca_tree_ = false, bool search_tree_ = false,
/*TechnologyParameter::*/DeviceType *dt = &(g_tp.peri_global));
~Htree2() {};
// Evaluate an input tree (address, data-in, search-in) or an output tree
// (data-out, search-out); both are invoked from the constructor.
void in_htree();
void out_htree();
// repeaters only at h-tree nodes
// NOTE(review): no definition of these two appears in htree2.cc -- confirm
// they are defined elsewhere or unused.
void limited_in_htree();
void limited_out_htree();
// Gate models added at tree nodes: s1 / s2 are the repeater sizes on the
// links before and after the gate, l is the length of the wire segment.
void input_nand(double s1, double s2, double l);
void output_buffer(double s1, double s2, double l);
// Both are set to 0 by the constructor.
double in_rise_time, out_rise_time;
void set_in_rise_time(double rt)
{
in_rise_time = rt;
}
double max_unpipelined_link_delay; // currently always set to 0 (marked TODO in htree2.cc)
powerDef power_bit; // copy of power taken before read dynamic energy is scaled by the root bus width
private:
double wire_bw; // current bus width; doubled while the tree is walked
double init_wire_bw; // bus width at root
enum Htree_type tree_type;
double htree_hnodes; // not referenced in htree2.cc
double htree_vnodes; // not referenced in htree2.cc
double mat_width;
double mat_height;
int add_bits, data_in_bits,search_data_in_bits,data_out_bits, search_data_out_bits;
int ndbl, ndwl;
bool uca_tree; // should have full bandwidth to access all banks in the array simultaneously
bool search_tree;
enum Wire_type wt; // wire type used for every link
double min_w_nmos; // g_tp.min_w_nmos_
double min_w_pmos; // min_w_nmos scaled by the device's n-to-p drive ratio
/*TechnologyParameter::*/DeviceType *deviceType;
};
#endif

3790
T1/TP/TP1/cacti_7/io.cc Normal file

File diff suppressed because it is too large Load diff

45
T1/TP/TP1/cacti_7/io.h Normal file
View file

@ -0,0 +1,45 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#ifndef __IO_H__
#define __IO_H__
#include "const.h"
#include "cacti_interface.h"
// Result reporting entry points.
// NOTE(review): `string` is used unqualified, so this header relies on
// <string> and a using-declaration/directive for std coming from const.h or
// cacti_interface.h -- confirm.

// Presumably writes the results in fin_res as CSV to the file named fn
// (default "out.csv") -- confirm against io.cc.
void output_data_csv(const uca_org_t & fin_res, string fn="out.csv");
// Presumably reports the UCA results in fin_res -- confirm against io.cc.
void output_UCA(uca_org_t * fin_res);
// Presumably the CSV writer for 3D DRAM results; takes no file name -- confirm against io.cc.
void output_data_csv_3dd(const uca_org_t & fin_res);
#endif

254
T1/TP/TP1/cacti_7/lpddr.cfg Normal file
View file

@ -0,0 +1,254 @@
# Cache size
//-size (bytes) 2048
//-size (bytes) 4096
//-size (bytes) 32768
//-size (bytes) 131072
//-size (bytes) 262144
//-size (bytes) 1048576
//-size (bytes) 2097152
//-size (bytes) 4194304
-size (bytes) 8388608
//-size (bytes) 16777216
//-size (bytes) 33554432
//-size (bytes) 134217728
//-size (bytes) 67108864
//-size (bytes) 1073741824
# power gating
-Array Power Gating - "false"
-WL Power Gating - "false"
-CL Power Gating - "false"
-Bitline floating - "false"
-Interconnect Power Gating - "false"
-Power Gating Performance Loss 0.01
# Line size
//-block size (bytes) 8
-block size (bytes) 64
# To model Fully Associative cache, set associativity to zero
//-associativity 0
//-associativity 2
//-associativity 4
//-associativity 8
-associativity 8
-read-write port 1
-exclusive read port 0
-exclusive write port 0
-single ended read ports 0
# Multiple banks connected using a bus
-UCA bank count 1
-technology (u) 0.022
//-technology (u) 0.040
//-technology (u) 0.032
//-technology (u) 0.090
# following three parameters are meaningful only for main memories
-page size (bits) 8192
-burst length 8
-internal prefetch width 8
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
-Data array cell type - "itrs-hp"
//-Data array cell type - "itrs-lstp"
//-Data array cell type - "itrs-lop"
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
-Data array peripheral type - "itrs-hp"
//-Data array peripheral type - "itrs-lstp"
//-Data array peripheral type - "itrs-lop"
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
-Tag array cell type - "itrs-hp"
//-Tag array cell type - "itrs-lstp"
//-Tag array cell type - "itrs-lop"
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
-Tag array peripheral type - "itrs-hp"
//-Tag array peripheral type - "itrs-lstp"
//-Tag array peripheral type - "itrs-lop"
# Bus width includes data bits and address bits required by the decoder
//-output/input bus width 16
-output/input bus width 512
// 300-400 in steps of 10
-operating temperature (K) 360
# Type of memory - cache (with a tag array) or ram (scratch ram similar to a register file)
# or main memory (no tag array and every access will happen at a page granularity Ref: CACTI 5.3 report)
-cache type "cache"
//-cache type "ram"
//-cache type "main memory"
# to model special structure like branch target buffers, directory, etc.
# change the tag size parameter
# if you want cacti to calculate the tagbits, set the tag size to "default"
-tag size (b) "default"
//-tag size (b) 22
# fast - data and tag access happen in parallel
# sequential - data array is accessed after accessing the tag array
# normal - data array lookup and tag access happen in parallel
# final data block is broadcasted in data array h-tree
# after getting the signal from the tag array
//-access mode (normal, sequential, fast) - "fast"
-access mode (normal, sequential, fast) - "normal"
//-access mode (normal, sequential, fast) - "sequential"
# DESIGN OBJECTIVE for UCA (or banks in NUCA)
-design objective (weight delay, dynamic power, leakage power, cycle time, area) 0:0:0:100:0
# Percentage deviation from the minimum value
# Ex: A deviation value of 10:1000:1000:1000:1000 will try to find an organization
# that compromises at most 10% delay.
# NOTE: Try reasonable values for % deviation. Inconsistent deviation
# percentage values will not produce any valid organizations. For example,
# 0:0:100:100:100 will try to identify an organization that has both
# least delay and dynamic power. Since such an organization is not possible, CACTI will
# throw an error. Refer CACTI-6 Technical report for more details
-deviate (delay, dynamic power, leakage power, cycle time, area) 20:100000:100000:100000:100000
# Objective for NUCA
-NUCAdesign objective (weight delay, dynamic power, leakage power, cycle time, area) 100:100:0:0:100
-NUCAdeviate (delay, dynamic power, leakage power, cycle time, area) 10:10000:10000:10000:10000
# Set optimize tag to ED or ED^2 to obtain a cache configuration optimized for
# energy-delay or energy-delay sq. product
# Note: Optimize tag will disable weight or deviate values mentioned above
# Set it to NONE to let weight and deviate values determine the
# appropriate cache configuration
//-Optimize ED or ED^2 (ED, ED^2, NONE): "ED"
-Optimize ED or ED^2 (ED, ED^2, NONE): "ED^2"
//-Optimize ED or ED^2 (ED, ED^2, NONE): "NONE"
-Cache model (NUCA, UCA) - "UCA"
//-Cache model (NUCA, UCA) - "NUCA"
# In order for CACTI to find the optimal NUCA bank value the following
# variable should be assigned 0.
-NUCA bank count 0
# NOTE: for nuca network frequency is set to a default value of
# 5GHz in time.c. CACTI automatically
# calculates the maximum possible frequency and downgrades this value if necessary
# By default CACTI considers both full-swing and low-swing
# wires to find an optimal configuration. However, it is possible to
# restrict the search space by changing the signaling from "default" to
# "fullswing" or "lowswing" type.
-Wire signaling (fullswing, lowswing, default) - "Global_30"
//-Wire signaling (fullswing, lowswing, default) - "default"
//-Wire signaling (fullswing, lowswing, default) - "lowswing"
//-Wire inside mat - "global"
-Wire inside mat - "semi-global"
//-Wire outside mat - "global"
-Wire outside mat - "semi-global"
-Interconnect projection - "conservative"
//-Interconnect projection - "aggressive"
# Contention in the network (which is a function of core count and cache level) is one of
# the critical factors used for deciding the optimal bank count value
# core count can be 4, 8, or 16
//-Core count 4
-Core count 8
//-Core count 16
-Cache level (L2/L3) - "L3"
-Add ECC - "true"
//-Print level (DETAILED, CONCISE) - "CONCISE"
-Print level (DETAILED, CONCISE) - "DETAILED"
# for debugging
//-Print input parameters - "true"
-Print input parameters - "false"
# force CACTI to model the cache with the
# following Ndbl, Ndwl, Nspd, Ndsam,
# and Ndcm values
//-Force cache config - "true"
-Force cache config - "false"
-Ndwl 1
-Ndbl 1
-Nspd 0
-Ndcm 1
-Ndsam1 0
-Ndsam2 0
#### Default CONFIGURATION values for baseline external IO parameters to DRAM. More details can be found in the CACTI-IO technical report (), especially Chapters 2 and 3.
# Memory Type (D=DDR3, L=LPDDR2, W=WideIO). Additional memory types can be defined by the user in extio_technology.cc, along with their technology and configuration parameters.
//-dram_type "D"
-dram_type "L"
//-dram_type "W"
//-dram_type "S"
# Memory State (R=Read, W=Write, I=Idle or S=Sleep)
//-iostate "R"
-iostate "W"
//-iostate "I"
//-iostate "S"
#Address bus timing. To alleviate the timing on the command and address bus due to high loading (shared across all memories on the channel), the interface allows for multi-cycle timing options.
-addr_timing 0.5 //DDR
//-addr_timing 1.0 //SDR (half of DQ rate)
//-addr_timing 2.0 //2T timing (One fourth of DQ rate)
//-addr_timing 3.0 // 3T timing (One sixth of DQ rate)
# Memory Density (Gbit per memory/DRAM die)
-mem_density 8 Gb //Valid values 2^n Gb
# IO frequency (MHz) (frequency of the external memory interface).
-bus_freq 533 MHz //As of current memory standards (2013), valid range 0 to 1.5 GHz for DDR3, 0 to 533 MHz for LPDDR2, 0 - 800 MHz for WideIO and 0 - 3 GHz for Low-swing differential. However this can change, and the user is free to define valid ranges based on new memory types or extending beyond existing standards for existing dram types.
# Duty Cycle (fraction of time in the Memory State defined above)
-duty_cycle 1.0 //Valid range 0 to 1.0
# Activity factor for Data (0->1 transitions) per cycle (for DDR, need to account for the higher activity in this parameter. E.g. max. activity factor for DDR is 1.0, for SDR is 0.5)
-activity_dq 1.0 //Valid range 0 to 1.0 for DDR, 0 to 0.5 for SDR
#-activity_dq .50 //Valid range 0 to 1.0 for DDR, 0 to 0.5 for SDR
# Activity factor for Control/Address (0->1 transitions) per cycle (for DDR, need to account for the higher activity in this parameter. E.g. max. activity factor for DDR is 1.0, for SDR is 0.5)
-activity_ca 1.0 //Valid range 0 to 1.0 for DDR, 0 to 0.5 for SDR, 0 to 0.25 for 2T, and 0 to 0.17 for 3T
#-activity_ca 0.25 //Valid range 0 to 1.0 for DDR, 0 to 0.5 for SDR, 0 to 0.25 for 2T, and 0 to 0.17 for 3T
# Number of DQ pins
-num_dq 72 //Number of DQ pins. Includes ECC pins.
# Number of DQS pins. DQS is a data strobe that is sent along with a small number of data-lanes so the source synchronous timing is local to these DQ bits. Typically, 1 DQS per byte (8 DQ bits) is used. The DQS is also typically differential, just like the CLK pin.
-num_dqs 36 //2 x differential pairs. Include ECC pins as well. Valid range 0 to 18. For x4 memories, could have 36 DQS pins.
# Number of CA pins
-num_ca 35 //Valid range 0 to 35 pins.
#-num_ca 25 //Valid range 0 to 35 pins.
# Number of CLK pins. CLK is typically a differential pair. In some cases additional CLK pairs may be used to limit the loading on the CLK pin.
-num_clk 2 //2 x differential pair. Valid values: 0/2/4.
# Number of Physical Ranks
-num_mem_dq 2 //Number of ranks (loads on DQ and DQS) per buffer/register. If multiple LRDIMMs or buffer chips exist, the analysis for capacity and power is reported per buffer/register.
# Width of the Memory Data Bus
-mem_data_width 32 //x4 or x8 or x16 or x32 memories. For WideIO upto x128.

270
T1/TP/TP1/cacti_7/main.cc Normal file
View file

@ -0,0 +1,270 @@
/*------------------------------------------------------------
* CACTI 6.5
* Copyright 2008 Hewlett-Packard Development Corporation
* All Rights Reserved
*
* Permission to use, copy, and modify this software and its documentation is
* hereby granted only under the following terms and conditions. Both the
* above copyright notice and this permission notice must appear in all copies
* of the software, derivative works or modified versions, and any portions
* thereof, and both notices must appear in supporting documentation.
*
* Users of this software agree to the terms and conditions set forth herein, and
* hereby grant back to Hewlett-Packard Company and its affiliated companies ("HP")
* a non-exclusive, unrestricted, royalty-free right and license under any changes,
* enhancements or extensions made to the core functions of the software, including
* but not limited to those affording compatibility with other hardware or software
* environments, but excluding applications which incorporate this software.
* Users further agree to use their best efforts to return to HP any such changes,
* enhancements or extensions that they make and inform HP of noteworthy uses of
* this software. Correspondence should be provided to HP at:
*
* Director of Intellectual Property Licensing
* Office of Strategy and Technology
* Hewlett-Packard Company
* 1501 Page Mill Road
* Palo Alto, California 94304
*
* This software may be distributed (but not offered for sale or transferred
* for compensation) to third parties, provided such third parties agree to
* abide by the terms and conditions of this notice.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND HP DISCLAIMS ALL
* WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL HP
* CORPORATION BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
* DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
* PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
* SOFTWARE.
*------------------------------------------------------------*/
#include "io.h"
#include <iostream>
#include "Ucache.h"
using namespace std;
/*
 * Program entry point.
 *
 * Two invocation styles are accepted:
 *   1) cacti -infile <input file name>
 *   2) cacti arg1 ... argN, where the total argument count (including the
 *      program name) is exactly 53, 55 or 64.  Every positional argument is
 *      converted with atoi(), except one that is converted with atof():
 *      argv[10] in the 53-argument form and argv[9] in the other two.
 *
 * Any other argument count is scanned for "-infile"; if no usable
 * "-infile <name>" pair is found, a usage message is written to stderr and
 * the process exits with status 1.
 *
 * Returns 0 after cacti_interface() has run and the result was cleaned up.
 */
int main(int argc,char *argv[])
{
    uca_org_t result;

    if (argc != 53 && argc != 55 && argc !=64)
    {
        bool infile_specified = false;
        string infile_name("");

        for (int32_t i = 0; i < argc; i++)
        {
            if (argv[i] == string("-infile"))
            {
                // The file name is the argument that follows the flag.  If the
                // flag is the last argument there is nothing to read: argv[argc]
                // is a null pointer and must not be turned into a std::string.
                if (i + 1 >= argc)
                {
                    break;
                }
                infile_specified = true;
                i++;
                infile_name = argv[i];
            }
        }

        if (infile_specified == false)
        {
            cerr << " Invalid arguments -- how to use CACTI:" << endl;
            cerr << "  1) cacti -infile <input file name>" << endl;
            cerr << "  2) cacti arg1 ... arg52 -- please refer to the README file" << endl;
            cerr << " No. of arguments input - " << argc << endl;
            exit(1);
        }
        else
        {
            result = cacti_interface(infile_name);
        }
    }
    else if (argc == 53)
    {
        // 52 positional arguments; argv[10] is the only floating-point one.
        result = cacti_interface(
            atoi(argv[ 1]), atoi(argv[ 2]), atoi(argv[ 3]), atoi(argv[ 4]), atoi(argv[ 5]), atoi(argv[ 6]),
            atoi(argv[ 7]), atoi(argv[ 8]), atoi(argv[ 9]), atof(argv[10]), atoi(argv[11]), atoi(argv[12]),
            atoi(argv[13]), atoi(argv[14]), atoi(argv[15]), atoi(argv[16]), atoi(argv[17]), atoi(argv[18]),
            atoi(argv[19]), atoi(argv[20]), atoi(argv[21]), atoi(argv[22]), atoi(argv[23]), atoi(argv[24]),
            atoi(argv[25]), atoi(argv[26]), atoi(argv[27]), atoi(argv[28]), atoi(argv[29]), atoi(argv[30]),
            atoi(argv[31]), atoi(argv[32]), atoi(argv[33]), atoi(argv[34]), atoi(argv[35]), atoi(argv[36]),
            atoi(argv[37]), atoi(argv[38]), atoi(argv[39]), atoi(argv[40]), atoi(argv[41]), atoi(argv[42]),
            atoi(argv[43]), atoi(argv[44]), atoi(argv[45]), atoi(argv[46]), atoi(argv[47]), atoi(argv[48]),
            atoi(argv[49]), atoi(argv[50]), atoi(argv[51]), atoi(argv[52]));
    }
    else if (argc == 55)
    {
        // 54 positional arguments; argv[9] is the only floating-point one.
        result = cacti_interface(
            atoi(argv[ 1]), atoi(argv[ 2]), atoi(argv[ 3]), atoi(argv[ 4]), atoi(argv[ 5]), atoi(argv[ 6]),
            atoi(argv[ 7]), atoi(argv[ 8]), atof(argv[ 9]), atoi(argv[10]), atoi(argv[11]), atoi(argv[12]),
            atoi(argv[13]), atoi(argv[14]), atoi(argv[15]), atoi(argv[16]), atoi(argv[17]), atoi(argv[18]),
            atoi(argv[19]), atoi(argv[20]), atoi(argv[21]), atoi(argv[22]), atoi(argv[23]), atoi(argv[24]),
            atoi(argv[25]), atoi(argv[26]), atoi(argv[27]), atoi(argv[28]), atoi(argv[29]), atoi(argv[30]),
            atoi(argv[31]), atoi(argv[32]), atoi(argv[33]), atoi(argv[34]), atoi(argv[35]), atoi(argv[36]),
            atoi(argv[37]), atoi(argv[38]), atoi(argv[39]), atoi(argv[40]), atoi(argv[41]), atoi(argv[42]),
            atoi(argv[43]), atoi(argv[44]), atoi(argv[45]), atoi(argv[46]), atoi(argv[47]), atoi(argv[48]),
            atoi(argv[49]), atoi(argv[50]), atoi(argv[51]), atoi(argv[52]), atoi(argv[53]), atoi(argv[54]));
    }
    else if (argc == 64)
    {
        // 63 positional arguments; argv[9] is the only floating-point one.
        result = cacti_interface(
            atoi(argv[ 1]), atoi(argv[ 2]), atoi(argv[ 3]), atoi(argv[ 4]), atoi(argv[ 5]), atoi(argv[ 6]),
            atoi(argv[ 7]), atoi(argv[ 8]), atof(argv[ 9]), atoi(argv[10]), atoi(argv[11]), atoi(argv[12]),
            atoi(argv[13]), atoi(argv[14]), atoi(argv[15]), atoi(argv[16]), atoi(argv[17]), atoi(argv[18]),
            atoi(argv[19]), atoi(argv[20]), atoi(argv[21]), atoi(argv[22]), atoi(argv[23]), atoi(argv[24]),
            atoi(argv[25]), atoi(argv[26]), atoi(argv[27]), atoi(argv[28]), atoi(argv[29]), atoi(argv[30]),
            atoi(argv[31]), atoi(argv[32]), atoi(argv[33]), atoi(argv[34]), atoi(argv[35]), atoi(argv[36]),
            atoi(argv[37]), atoi(argv[38]), atoi(argv[39]), atoi(argv[40]), atoi(argv[41]), atoi(argv[42]),
            atoi(argv[43]), atoi(argv[44]), atoi(argv[45]), atoi(argv[46]), atoi(argv[47]), atoi(argv[48]),
            atoi(argv[49]), atoi(argv[50]), atoi(argv[51]), atoi(argv[52]), atoi(argv[53]), atoi(argv[54]),
            atoi(argv[55]), atoi(argv[56]), atoi(argv[57]), atoi(argv[58]), atoi(argv[59]), atoi(argv[60]),
            atoi(argv[61]), atoi(argv[62]), atoi(argv[63]));
    }

    cout << "=============================================\n\n";
    // print_g_tp(); //function to test technology paramters.
    // g_tp.display();
    result.cleanup();
    // delete result.data_array2;
    // if (result.tag_array2!=NULL)
    //   delete result.tag_array2;
    return 0;
}

View file

@ -0,0 +1,28 @@
# Wrapper makefile: delegates the real build to $(TAR).mk, selecting the
# debug or optimized flavour through the TAG variable, and creates/removes
# the per-flavour object directories.
TAR = cacti

# None of these targets produce a file of the same name.  `all` is listed too,
# so that a stray file called "all" cannot silently disable the default build.
.PHONY: all dbg opt depend clean clean_dbg clean_opt

# Default target: debug build.
all: dbg

dbg: $(TAR).mk obj_dbg
	@$(MAKE) TAG=dbg -C . -f $(TAR).mk

opt: $(TAR).mk obj_opt
	@$(MAKE) TAG=opt -C . -f $(TAR).mk

# Object directories; -p keeps the rule harmless if the directory already
# exists (e.g. when two make invocations race).
obj_dbg:
	mkdir -p $@

obj_opt:
	mkdir -p $@

clean: clean_dbg clean_opt

# The object directory is a prerequisite so that the delegated `clean` has a
# directory to work in; it is removed afterwards ($< is that directory).
clean_dbg: obj_dbg
	@$(MAKE) TAG=dbg -C . -f $(TAR).mk clean
	rm -rf $<

clean_opt: obj_opt
	@$(MAKE) TAG=opt -C . -f $(TAR).mk clean
	rm -rf $<

1940
T1/TP/TP1/cacti_7/mat.cc Normal file

File diff suppressed because it is too large Load diff

176
T1/TP/TP1/cacti_7/mat.h Normal file
View file

@ -0,0 +1,176 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#ifndef __MAT_H__
#define __MAT_H__
#include "component.h"
#include "decoder.h"
#include "wire.h"
#include "subarray.h"
#include "powergating.h"
// Mat: a Component subclass that declares the decoders, predecoders, drivers,
// power/energy records, delay values and geometry counters associated with one
// mat. Only declarations are visible in this header; the definitions are
// presumably in mat.cc (TODO confirm).
class Mat : public Component
{
public:
// Constructs a Mat from a set of dynamic parameters.
// NOTE(review): dyn_p is presumably what the reference member `dp` is bound to -- confirm in mat.cc.
Mat(const DynamicParameter & dyn_p);
~Mat();
// Takes an input rise time and returns the output rise time.
double compute_delays(double inrisetime); // return outrisetime
void compute_power_energy();
// Reference member: it cannot be re-seated after construction, so the
// referenced DynamicParameter must outlive this object.
const DynamicParameter & dp;
// TODO: clean up pointers and powerDefs below
// --- Decoders ---
Decoder * row_dec;
Decoder * bit_mux_dec;
Decoder * sa_mux_lev_1_dec;
Decoder * sa_mux_lev_2_dec;
// --- Predecoder blocks and their drivers (way-select related, judging by the names) ---
PredecBlk * dummy_way_sel_predec_blk1;
PredecBlk * dummy_way_sel_predec_blk2;
PredecBlkDrv * way_sel_drv1;
PredecBlkDrv * dummy_way_sel_predec_blk_drv2;
// --- Predecoders, one per decoder above (naming mirrors row/bit-mux/sa-mux level 1/level 2) ---
Predec * r_predec;
Predec * b_mux_predec;
Predec * sa_mux_lev_1_predec;
Predec * sa_mux_lev_2_predec;
Wire * subarray_out_wire;
// --- Drivers ---
Driver * bl_precharge_eq_drv;
Driver * cam_bl_precharge_eq_drv;//bitline pre-charge circuit is separated for CAM and RAM arrays.
Driver * ml_precharge_drv;//matchline precharge driver
Driver * sl_precharge_eq_drv;//searchline precharge driver
Driver * sl_data_drv;//search line data driver
// NOTE(review): the original comment here repeated "search line data driver"
// (same as sl_data_drv); the name suggests a matchline-to-RAM-wordline driver -- confirm.
Driver * ml_to_ram_wl_drv;
// --- Power/energy records per component ---
powerDef power_row_decoders;
powerDef power_bit_mux_decoders;
powerDef power_sa_mux_lev_1_decoders;
powerDef power_sa_mux_lev_2_decoders;
powerDef power_fa_cam; // TODO: leakage power is not computed yet
powerDef power_bl_precharge_eq_drv;
powerDef power_subarray_out_drv;
powerDef power_cam_all_active;
powerDef power_searchline_precharge;
powerDef power_matchline_precharge;
powerDef power_ml_to_ram_wl_drv;
// --- Delay values per stage (units are not stated in this header) ---
double delay_fa_tag, delay_cam;
double delay_before_decoder;
double delay_bitline;
double delay_wl_reset;
double delay_bl_restore;
double delay_searchline;
// NOTE(review): "matchchline" looks like a misspelling of "matchline"; the
// identifier is public, so renaming it would require updating every user.
double delay_matchchline;
double delay_cam_sl_restore;
double delay_cam_ml_reset;
double delay_fa_ram_wl;
double delay_hit_miss_reset;
double delay_hit_miss;
// Subarray held by value (not a pointer), unlike most components above.
Subarray subarray;
powerDef power_bitline, power_searchline, power_matchline, power_bitline_gated;
double per_bitline_read_energy;
int deg_bl_muxing;
int num_act_mats_hor_dir;
double delay_writeback;
Area cell,cam_cell;
// Array-kind flags (DRAM, fully associative, pure CAM, CAM present -- judging by the names).
bool is_dram,is_fa, pure_cam, camFlag;
int num_mats;
// --- Sense amplifier ---
powerDef power_sa;
double delay_sa;
double leak_power_sense_amps_closed_page_state;
double leak_power_sense_amps_open_page_state;
// --- Subarray output driver and comparator ---
double delay_subarray_out_drv;
double delay_subarray_out_drv_htree;
double delay_comparator;
powerDef power_comparator;
int num_do_b_mat;
int num_so_b_mat;
int num_sa_subarray;
int num_sa_subarray_search;
double C_bl;
uint32_t num_subarrays_per_mat; // the number of subarrays in a mat
uint32_t num_subarrays_per_row; // the number of subarrays in a row of a mat
// --- Power-gating related members (judging by the names and the powergating.h
// include): leakage values, sleep transistors, and per-domain wake-up
// energy (…_wakeup_e), wake-up time (…_wakeup_t) and sleep-transistor area.
double array_leakage;
double wl_leakage;
double cl_leakage;
Sleep_tx * sram_sleep_tx;
Sleep_tx * wl_sleep_tx;
Sleep_tx * cl_sleep_tx;
powerDef array_wakeup_e;
double array_wakeup_t;
double array_sleep_tx_area;
powerDef blfloating_wakeup_e;
double blfloating_wakeup_t;
double blfloating_sleep_tx_area;
powerDef wl_wakeup_e;
double wl_wakeup_t;
double wl_sleep_tx_area;
powerDef cl_wakeup_e;
double cl_wakeup_t;
double cl_sleep_tx_area;
// These three stage-delay functions are public; commented-out declarations of
// the same three functions remain in the private section below.
double compute_bitline_delay(double inrisetime);
double compute_sa_delay(double inrisetime);
double compute_subarray_out_drv(double inrisetime);
private:
// Private geometry and delay helpers (what each returns is not visible in this header).
double compute_bit_mux_sa_precharge_sa_mux_wr_drv_wr_mux_h();
double width_write_driver_or_write_mux();
double compute_comparators_height(int tagbits, int number_ways_in_mat, double subarray_mem_cell_area_w);
double compute_cam_delay(double inrisetime);
//double compute_bitline_delay(double inrisetime);
//double compute_sa_delay(double inrisetime);
//double compute_subarray_out_drv(double inrisetime);
double compute_comparator_delay(double inrisetime);
// NOTE(review): presumably port counts (read-write, exclusive-read,
// exclusive-write, search) -- confirm against mat.cc.
int RWP;
int ERP;
int EWP;
int SCHP;
};
#endif

599
T1/TP/TP1/cacti_7/memcad.cc Normal file
View file

@ -0,0 +1,599 @@
#include "memcad.h"
#include <vector>
#include <list>
#include <algorithm>
#include <iostream>
#include <cmath>
#include <cassert>
using namespace std;
// File-level candidate lists shared by the search routines below. Each is a
// heap-allocated vector of owning raw pointers:
//  - memcad_all_channels is allocated by find_all_channels() and replaced by prune_channels();
//  - memcad_all_bobs is allocated by find_all_bobs() and replaced by prune_bobs();
//  - memcad_all_memories is allocated by find_all_memories().
vector<channel_conf*> *memcad_all_channels;
vector<bob_conf*> *memcad_all_bobs;
vector<memory_conf*> *memcad_all_memories;
// NOTE(review): not assigned anywhere in the part of the file reviewed here;
// presumably filled with the selected configurations later -- confirm.
vector<memory_conf*> *memcad_best_results;
/*
 * Ordering predicate for std::sort over channel configurations.
 *
 * Channels are ordered by ascending capacity first.  Ties are broken by the
 * three user-selected metrics, in priority order (first_metric, second_metric,
 * third_metric, all read from first->memcad_params):
 *   - Cost:      lower cost sorts first;
 *   - Bandwidth: higher bandwidth sorts first;
 *   - Energy:    lower energy_per_access sorts first; differences of at most
 *                EPS are treated as a tie.
 *
 * Returns true when `first` must be placed before `second`.  When every
 * criterion ties the function returns false: std::sort requires a strict weak
 * ordering, so comp(a, a) must be false (the previous `return true` made the
 * sort's behaviour undefined).
 */
bool compare_channels(channel_conf* first, channel_conf* second)
{
    if (first->capacity != second->capacity)
        return (first->capacity < second->capacity);

    // Tie-breakers, highest priority first.
    const MemCad_metrics priority[3] = {
        first->memcad_params->first_metric,
        first->memcad_params->second_metric,
        first->memcad_params->third_metric
    };

    for (int level = 0; level < 3; ++level)
    {
        switch (priority[level])
        {
            case (Cost):
                if (first->cost != second->cost)
                    return (first->cost < second->cost);
                break;
            case (Bandwidth):
                if (first->bandwidth != second->bandwidth)
                    return (first->bandwidth > second->bandwidth);
                break;
            case (Energy):
                if (fabs(first->energy_per_access - second->energy_per_access) > EPS)
                    return (first->energy_per_access < second->energy_per_access);
                break;
            default:
                assert(false);
        }
    }

    // Equivalent under every criterion: neither element precedes the other.
    return false;
}
void prune_channels()
{
vector<channel_conf*> * temp = new vector<channel_conf*>();
int last_added = -1;
for(unsigned int i=0;i< memcad_all_channels->size();i++)
{
if(last_added != (*memcad_all_channels)[i]->capacity)
{
temp->push_back(clone((*memcad_all_channels)[i]));
last_added = (*memcad_all_channels)[i]->capacity;
}
}
for(unsigned int i=0;i< memcad_all_channels->size();i++)
{
delete (*memcad_all_channels)[i];
}
memcad_all_channels->clear();
delete memcad_all_channels;
memcad_all_channels = temp;
}
void find_all_channels(MemCadParameters * memcad_params)
{
int DIMM_size[]={0,4,8,16,32,64};
Mem_IO_type current_io_type = memcad_params->io_type;
DIMM_Model current_dimm_model = memcad_params->dimm_model;
memcad_all_channels= new vector<channel_conf*>();
// channels can have up to 3 DIMMs per channel
// di is the capacity if i-th dimm in the channel
for(int d1=0; d1<6;d1++)
{
for(int d2=d1;d2<6;d2++)
{
for(int d3=d2;d3<6;d3++)
{
// channel capacity should not exceed the entire memory capacity.
if((DIMM_size[d1]+DIMM_size[d2]+DIMM_size[d3])>memcad_params->capacity)
continue;
if( ((current_dimm_model== JUST_LRDIMM) || (current_dimm_model== ALL))
&& ((d1==0) || (MemoryParameters::cost[current_io_type][2][d1-1]<INF))
&& ((d2==0) || (MemoryParameters::cost[current_io_type][2][d2-1]<INF))
&& ((d3==0) || (MemoryParameters::cost[current_io_type][2][d3-1]<INF)) )
{
int num_dimm_per_channel =0;
vector<int> dimm_cap;
dimm_cap.push_back(DIMM_size[d1]); if(d1>0) num_dimm_per_channel++;
dimm_cap.push_back(DIMM_size[d2]); if(d2>0) num_dimm_per_channel++;
dimm_cap.push_back(DIMM_size[d3]); if(d3>0) num_dimm_per_channel++;
int max_index = bw_index(current_io_type, MemoryParameters::bandwidth_load[current_io_type][4-num_dimm_per_channel]);
for(int bw_id=0;bw_id<=max_index; ++bw_id)
{
int bandwidth = MemoryParameters::bandwidth_load[current_io_type][bw_id];
channel_conf * new_channel = new channel_conf(memcad_params, dimm_cap, bandwidth, LRDIMM, false);
if(new_channel->cost <INF)
{
memcad_all_channels->push_back(new_channel);
}
if((DIMM_size[d1]+DIMM_size[d2]+DIMM_size[d3])==0)
continue;
if(memcad_params->low_power_permitted)
{
new_channel = new channel_conf(memcad_params, dimm_cap, bandwidth, LRDIMM, true);
if(new_channel->cost <INF)
{
memcad_all_channels->push_back(new_channel);
}
}
}
}
if( (current_dimm_model== JUST_RDIMM) || (current_dimm_model== ALL)
&& ((d1==0) || (MemoryParameters::cost[current_io_type][1][d1-1]<INF))
&& ((d2==0) || (MemoryParameters::cost[current_io_type][1][d2-1]<INF))
&& ((d3==0) || (MemoryParameters::cost[current_io_type][1][d3-1]<INF)) )
{
int num_dimm_per_channel =0;
vector<int> dimm_cap;
dimm_cap.push_back(DIMM_size[d1]); if(d1>0) num_dimm_per_channel++;
dimm_cap.push_back(DIMM_size[d2]); if(d2>0) num_dimm_per_channel++;
dimm_cap.push_back(DIMM_size[d3]); if(d3>0) num_dimm_per_channel++;
if((DIMM_size[d1]+DIMM_size[d2]+DIMM_size[d3])==0)
continue;
int max_index = bw_index(current_io_type, MemoryParameters::bandwidth_load[current_io_type][4-num_dimm_per_channel]);
for(int bw_id=0;bw_id<=max_index; ++bw_id)
{
int bandwidth = MemoryParameters::bandwidth_load[current_io_type][bw_id];
channel_conf * new_channel = new channel_conf(memcad_params, dimm_cap, bandwidth, RDIMM, false);
if(new_channel->cost <INF)
{
memcad_all_channels->push_back(new_channel);
}
if(memcad_params->low_power_permitted)
{
new_channel = new channel_conf(memcad_params, dimm_cap, bandwidth, RDIMM, true);
if(new_channel->cost <INF)
{
memcad_all_channels->push_back(new_channel);
}
}
}
}
if( (current_dimm_model== JUST_UDIMM) || (current_dimm_model== ALL)
&& ((d1==0) || (MemoryParameters::cost[current_io_type][0][d1-1]<INF))
&& ((d2==0) || (MemoryParameters::cost[current_io_type][0][d2-1]<INF))
&& ((d3==0) || (MemoryParameters::cost[current_io_type][0][d3-1]<INF)) )
{
int num_dimm_per_channel =0;
vector<int> dimm_cap;
dimm_cap.push_back(DIMM_size[d1]); if(d1>0) num_dimm_per_channel++;
dimm_cap.push_back(DIMM_size[d2]); if(d2>0) num_dimm_per_channel++;
dimm_cap.push_back(DIMM_size[d3]); if(d3>0) num_dimm_per_channel++;
if((DIMM_size[d1]+DIMM_size[d2]+DIMM_size[d3])==0)
continue;
int max_index = bw_index(current_io_type, MemoryParameters::bandwidth_load[current_io_type][4-num_dimm_per_channel]);
for(int bw_id=0;bw_id<=max_index; ++bw_id)
{
int bandwidth = MemoryParameters::bandwidth_load[current_io_type][bw_id];
channel_conf * new_channel = new channel_conf(memcad_params, dimm_cap, bandwidth, UDIMM, false);
if(new_channel->cost <INF)
{
memcad_all_channels->push_back(new_channel);
}
if(memcad_params->low_power_permitted)
{
new_channel = new channel_conf(memcad_params, dimm_cap, bandwidth, UDIMM, true);
if(new_channel->cost <INF)
{
memcad_all_channels->push_back(new_channel);
}
}
}
}
}
}
}
sort(memcad_all_channels->begin(), memcad_all_channels->end(), compare_channels);
prune_channels();
if(memcad_params->verbose)
{
for(unsigned int i=0;i<memcad_all_channels->size();i++)
{
cout << *(*memcad_all_channels)[i] << endl;
}
}
}
/*
 * Ordering predicate for std::sort: orders channels by ascending bandwidth.
 * Returns true when `first` has strictly less bandwidth than `second`.
 */
bool compare_channels_bw(channel_conf* first, channel_conf* second)
{
    const bool first_is_slower = first->bandwidth < second->bandwidth;
    return first_is_slower;
}
/*
 * Ordering predicate for std::sort over BoB configurations.
 *
 * BoBs are ordered by ascending capacity first.  Ties are broken by the three
 * user-selected metrics, in priority order (first_metric, second_metric,
 * third_metric, all read from first->memcad_params):
 *   - Cost:      lower cost sorts first;
 *   - Bandwidth: higher bandwidth sorts first;
 *   - Energy:    lower energy_per_access sorts first; differences of at most
 *                EPS are treated as a tie.
 *
 * Returns true when `first` must be placed before `second`.  When every
 * criterion ties the function returns false: std::sort requires a strict weak
 * ordering, so comp(a, a) must be false (the previous `return true` made the
 * sort's behaviour undefined).
 */
bool compare_bobs(bob_conf* first, bob_conf* second)
{
    if (first->capacity != second->capacity)
        return (first->capacity < second->capacity);

    // Tie-breakers, highest priority first.
    const MemCad_metrics priority[3] = {
        first->memcad_params->first_metric,
        first->memcad_params->second_metric,
        first->memcad_params->third_metric
    };

    for (int level = 0; level < 3; ++level)
    {
        switch (priority[level])
        {
            case (Cost):
                if (first->cost != second->cost)
                    return (first->cost < second->cost);
                break;
            case (Bandwidth):
                if (first->bandwidth != second->bandwidth)
                    return (first->bandwidth > second->bandwidth);
                break;
            case (Energy):
                if (fabs(first->energy_per_access - second->energy_per_access) > EPS)
                    return (first->energy_per_access < second->energy_per_access);
                break;
            default:
                assert(false);
        }
    }

    // Equivalent under every criterion: neither element precedes the other.
    return false;
}
void prune_bobs()
{
vector<bob_conf*> * temp = new vector<bob_conf*>();
int last_added = -1;
for(unsigned int i=0;i< memcad_all_bobs->size();i++)
{
if(last_added != (*memcad_all_bobs)[i]->capacity)
{
temp->push_back(clone((*memcad_all_bobs)[i]));
last_added = (*memcad_all_bobs)[i]->capacity;
}
}
for(unsigned int i=0;i< memcad_all_bobs->size();i++)
{
delete (*memcad_all_bobs)[i];
}
memcad_all_bobs->clear();
delete memcad_all_bobs;
memcad_all_bobs = temp;
}
/*
 * Appends to memcad_all_bobs one bob_conf for every non-decreasing sequence of
 * `nb` channel indices drawn from [start, end] (combinations with repetition).
 *
 * memcad_params  parameters forwarded to the bob_conf constructor
 * start, end     inclusive index range into memcad_all_channels
 * nb             number of channels still to choose
 * channel_index  indices chosen so far by the enclosing recursion levels; it
 *                is restored to its incoming contents before returning
 *
 * A request for fewer than one channel produces nothing.  Without this guard
 * the recursion never reached its nb==1 base case and did not terminate.
 */
void find_bobs_recursive(MemCadParameters * memcad_params,int start,int end,int nb, list<int> *channel_index)
{
    if(nb<1)
    {
        return;
    }

    if(nb==1)
    {
        // Base case: complete the selection with each remaining index in turn.
        for(int i=start; i<=end;++i)
        {
            channel_index->push_back(i);

            vector<channel_conf*> temp;
            for(list<int>::iterator it= channel_index->begin(); it!= channel_index->end(); it++)
            {
                int idx = *it;
                temp.push_back((*memcad_all_channels)[idx]);
            }
            memcad_all_bobs->push_back(new bob_conf(memcad_params, &temp));
            temp.clear();

            channel_index->pop_back();
        }
        return;
    }

    // Choose index i for this position, then pick the remaining nb-1 indices
    // from [i, end] so that sequences stay non-decreasing.
    for(int i=start;i<=end;++i)
    {
        channel_index->push_back(i);
        find_bobs_recursive(memcad_params,i,end,nb-1,channel_index);
        channel_index->pop_back();
    }
}
/*
 * Builds the global memcad_all_bobs (freshly allocated here) from the channels
 * in memcad_all_channels, then sorts it with compare_bobs and prunes it with
 * prune_bobs.
 *
 *  - mirror_in_bob:   one BoB per channel, made of num_channels_per_bob copies
 *                     of that channel's pointer.
 *  - same_bw_in_bob:  channels are sorted by bandwidth and split into runs of
 *                     equal bandwidth; every combination of
 *                     num_channels_per_bob channels inside one run yields a
 *                     BoB (see find_bobs_recursive).  Note that this re-sorts
 *                     memcad_all_channels in place.
 *  - otherwise:       unsupported; a message is printed and assert(false) fires.
 *
 * An empty channel list now simply produces no BoBs in the same_bw_in_bob
 * branch; previously element 0 was dereferenced and size()-1 wrapped around.
 *
 * When memcad_params->verbose is set, the surviving BoBs are printed.
 */
void find_all_bobs(MemCadParameters * memcad_params)
{
    memcad_all_bobs = new vector<bob_conf*>();

    if(memcad_params->mirror_in_bob)
    {
        for(unsigned int i=0;i<memcad_all_channels->size();++i)
        {
            vector<channel_conf*> channels;
            for(int j=0;j<memcad_params->num_channels_per_bob;j++)
                channels.push_back((*memcad_all_channels)[i]);
            memcad_all_bobs->push_back(new bob_conf(memcad_params, &channels));
            channels.clear();
        }
    }
    else if(memcad_params->same_bw_in_bob)
    {
        sort(memcad_all_channels->begin(), memcad_all_channels->end(), compare_channels_bw);

        if(!memcad_all_channels->empty())
        {
            // [start_index[k], end_index[k]] delimits the k-th run of equal bandwidth.
            vector<int> start_index; start_index.push_back(0);
            vector<int> end_index;
            int last_bw =(*memcad_all_channels)[0]->bandwidth;
            for(unsigned int i=0;i< memcad_all_channels->size();i++)
            {
                if(last_bw!=(*memcad_all_channels)[i]->bandwidth)
                {
                    end_index.push_back(i-1);
                    start_index.push_back(i);
                    last_bw = (*memcad_all_channels)[i]->bandwidth;
                }
            }
            end_index.push_back(memcad_all_channels->size()-1);

            list<int> channel_index;
            for(unsigned int i=0;i< start_index.size();++i)
            {
                find_bobs_recursive(memcad_params,start_index[i],end_index[i],memcad_params->num_channels_per_bob, &channel_index);
            }
        }
    }
    else
    {
        cout << "We do not support different frequencies per in a BoB!" << endl;
        assert(false);
    }

    sort(memcad_all_bobs->begin(), memcad_all_bobs->end(), compare_bobs);
    prune_bobs();

    if(memcad_params->verbose)
    {
        for(unsigned int i=0;i<memcad_all_bobs->size();i++)
        {
            cout << *(*memcad_all_bobs)[i] << endl;
        }
    }
}
/*
 * Appends to memcad_all_memories one memory_conf for every non-decreasing
 * sequence of `nb` BoB indices (starting at `start`) whose capacities add up
 * to exactly `remaining_capacity`.
 *
 * memcad_params       parameters forwarded to the memory_conf constructor
 * remaining_capacity  capacity still to be covered by the BoBs not yet chosen
 * start               first index into memcad_all_bobs that may be chosen
 * nb                  number of BoBs still to choose
 * bobs_index          indices chosen so far by the enclosing recursion levels;
 *                     it is restored to its incoming contents before returning
 *
 * A request for fewer than one BoB produces nothing.  Without this guard the
 * nb==1 base case was never reached, and the recursion could fail to
 * terminate when a BoB of capacity 0 is present.
 */
void find_mems_recursive(MemCadParameters * memcad_params, int remaining_capacity, int start, int nb, list<int>* bobs_index)
{
    if(nb<1)
    {
        return;
    }

    if(nb==1)
    {
        // Base case: the last BoB must cover the remaining capacity exactly.
        for(unsigned int i=start; i< memcad_all_bobs->size();++i)
        {
            if((*memcad_all_bobs)[i]->capacity != remaining_capacity)
                continue;

            bobs_index->push_back(i);
            vector<bob_conf*> temp;
            for(list<int>::iterator it= bobs_index->begin(); it!= bobs_index->end(); it++)
            {
                int index = *it;
                temp.push_back((*memcad_all_bobs)[index]);
            }
            memcad_all_memories->push_back(new memory_conf(memcad_params, &temp));
            temp.clear();
            bobs_index->pop_back();
        }
        return;
    }

    // Choose BoB i for this position (if it fits), then fill the remaining
    // nb-1 positions from index i onwards with the capacity that is left.
    for(unsigned int i=start; i<memcad_all_bobs->size();i++)
    {
        if((*memcad_all_bobs)[i]->capacity > remaining_capacity)
            continue;

        int new_remaining_capacity = remaining_capacity-(*memcad_all_bobs)[i]->capacity;
        bobs_index->push_back(i);
        find_mems_recursive(memcad_params, new_remaining_capacity, i, nb-1, bobs_index);
        bobs_index->pop_back();
    }
}
//void find_mems_recursive(MemCadParameters * memcad_params, int start, int
bool compare_memories(memory_conf* first, memory_conf* second)
{
if(first->capacity != second->capacity)
return (first->capacity < second->capacity);
MemCad_metrics first_metric = first->memcad_params->first_metric;
MemCad_metrics second_metric = first->memcad_params->second_metric;
MemCad_metrics third_metric = first->memcad_params->third_metric;
switch(first_metric)
{
case(Cost):
if(first->cost != second->cost)
return (first->cost < second->cost);
break;
case(Bandwidth):
if(first->bandwidth != second->bandwidth)
return (first->bandwidth > second->bandwidth);
break;
case(Energy):
if( fabs(first->energy_per_access - second->energy_per_access)>EPS)
return (first->energy_per_access < second->energy_per_access);
break;
default:
assert(false);
}
switch(second_metric)
{
case(Cost):
if(first->cost != second->cost)
return (first->cost < second->cost);
break;
case(Bandwidth):
if(first->bandwidth != second->bandwidth)
return (first->bandwidth > second->bandwidth);
break;
case(Energy):
if( fabs(first->energy_per_access - second->energy_per_access)>EPS)
return (first->energy_per_access < second->energy_per_access);
break;
default:
assert(false);
}
switch(third_metric)
{
case(Cost):
if(first->cost != second->cost)
return (first->cost < second->cost);
break;
case(Bandwidth):
if(first->bandwidth != second->bandwidth)
return (first->bandwidth > second->bandwidth);
break;
case(Energy):
if( fabs(first->energy_per_access - second->energy_per_access)>EPS)
return (first->energy_per_access < second->energy_per_access);
break;
default:
assert(false);
}
return true;
}
// Builds memcad_all_memories from the BoBs in memcad_all_bobs, sorts it with
// compare_memories and prints the best (up to three) configurations.
// Returns false when no combination matches the requested capacity, true
// otherwise.
bool find_all_memories(MemCadParameters * memcad_params)
{
	memcad_all_memories = new vector<memory_conf*>();
	list<int> picked_bobs;
	find_mems_recursive(memcad_params, memcad_params->capacity, 0,memcad_params->num_bobs, &picked_bobs);
	sort(memcad_all_memories->begin(), memcad_all_memories->end(), compare_memories);

	vector<memory_conf*>& results = *memcad_all_memories;
	if(memcad_params->verbose)
	{
		cout << "all possible results:" << endl;
		for(vector<memory_conf*>::iterator it=results.begin(); it!=results.end(); ++it)
			cout << *(*it) << endl;
	}
	if(results.empty())
	{
		cout << "No result found " << endl;
		return false;
	}
	cout << "top 3 best memory configurations are:" << endl;
	// Fewer than three candidates may exist.
	const int shown = (results.size()>3) ? 3 : (int)results.size();
	for(int rank=0; rank<shown; ++rank)
	{
		memory_conf* candidate = results[rank];
		if(candidate)
			cout << *candidate << endl;
	}
	return true;
}
void clean_results()
{
for(unsigned int i=0;i<memcad_all_channels->size();++i)
{
delete (*memcad_all_channels)[i];
}
delete memcad_all_channels;
for(unsigned int i=0;i<memcad_all_bobs->size();++i)
{
delete (*memcad_all_bobs)[i];
}
delete memcad_all_bobs;
for(unsigned int i=0;i<memcad_all_memories->size();++i)
{
delete (*memcad_all_memories)[i];
}
delete memcad_all_memories;
}
// Runs the whole MemCAD search: channels, then BoBs, then complete memories
// (which prints the best results), and finally frees all candidates.
void solve_memcad(MemCadParameters * memcad_params)
{
	// Each stage consumes the global vector filled by the previous one.
	find_all_channels(memcad_params);
	find_all_bobs(memcad_params);
	// The "no result" outcome is already reported on stdout by the callee.
	(void)find_all_memories(memcad_params);

	clean_results();
}

View file

@ -0,0 +1,30 @@
#ifndef __MEMCAD_H__
#define __MEMCAD_H__
// Interface of the MemCAD solver: global candidate lists and the functions
// that fill, rank and release them.
#include "memcad_parameters.h"
#include <vector>
// Global candidate lists. clean_results() deletes the first three together
// with the objects they point to.
extern vector<channel_conf*> *memcad_all_channels;
extern vector<bob_conf*> *memcad_all_bobs;
extern vector<memory_conf*> *memcad_all_memories;
// NOTE(review): clean_results() does not touch this one -- confirm who owns it.
extern vector<memory_conf*> *memcad_best_results;
void find_all_channels(MemCadParameters * memcad_params);
void find_all_bobs(MemCadParameters * memcad_params);
// Returns false when no memory configuration was found.
bool find_all_memories(MemCadParameters * memcad_params);
void clean_results();
// Runs find_all_channels, find_all_bobs, find_all_memories and clean_results in that order.
void solve_memcad(MemCadParameters * memcad_params);
#endif

View file

@ -0,0 +1,466 @@
#include "memcad_parameters.h"
#include <cmath>
#include <cassert>
// Builds the MemCAD parameter set.
// Members marked "from g_ip" are copied from the parsed input; the others are
// not read from the input yet and keep a fixed built-in value.
MemCadParameters::MemCadParameters(InputParameter * g_ip)
:io_type(g_ip->io_type)                          // from g_ip (DDR3 vs. DDR4)
,capacity(g_ip->capacity)                        // from g_ip, in GB
,num_bobs(g_ip->num_bobs)                        // from g_ip
,num_channels_per_bob(g_ip->num_channels_per_bob)// from g_ip, 1 means no bob
,capacity_wise(true)                             // fixed: load on each channel is proportional to its capacity
,first_metric(g_ip->first_metric)                // from g_ip
,second_metric(g_ip->second_metric)              // from g_ip
,third_metric(g_ip->third_metric)                // from g_ip
,dimm_model(g_ip->dimm_model)                    // from g_ip
,low_power_permitted(false)                      // fixed
,load(0.9)                                       // fixed, between 0 and 1
,row_buffer_hit_rate(1)                          // fixed
,rd_2_wr_ratio(2)                                // fixed
,same_bw_in_bob(true)                            // fixed: all channels in a bob share one bandwidth
,mirror_in_bob(g_ip->mirror_in_bob)              // from g_ip: all channels in a bob share one config
,total_power(false)                              // fixed: false means only I/O power is considered
,verbose(g_ip->verbose)                          // from g_ip
{
}
// Placeholder: currently prints nothing.
void MemCadParameters::print_inputs()
{
}
// Placeholder: performs no validation and always reports success.
bool MemCadParameters::sanity_check()
{
return true;
}
// Supply-voltage and IDD current tables.
// NOTE(review): nothing in this file reads these tables; units are not stated
// here (presumably volts and mA -- confirm against the data source).
double MemoryParameters::VDD[2][2][4]= //[lp:hp][ddr3:ddr4][frequency index]
{
{
{1.5,1.5,1.5,1.5},
{1.2,1.2,1.2,1.2}
},
{
{1.35,1.35,1.35,1.35},
{1.0,1.0,1.0,1.0}
}
};
// The IDD tables below are [2][4]; presumably [ddr3:ddr4][frequency index] -- TODO confirm.
double MemoryParameters::IDD0[2][4]=
{
{55,60,65,75},
{58,58,60,64}
};
double MemoryParameters::IDD2P0[2][4]=
{
{20,20,20,20},
{20,20,20,20}
};
double MemoryParameters::IDD2P1[2][4]=
{
{30,30,32,37},
{30,30,30,32}
};
double MemoryParameters::IDD2N[2][4]=
{
{40,42,45,50},
{44,44,46,50}
};
double MemoryParameters::IDD3P[2][4]=
{
{45,50,55,60},
{44,44,44,44}
};
double MemoryParameters::IDD3N[2][4]=
{
{42,47,52,57},
{44,44,44,44}
};
double MemoryParameters::IDD4R[2][4]=
{
{120,135,155,175},
{140,140,150,160}
};
double MemoryParameters::IDD4W[2][4]=
{
{100,125,145,165},
{156,156,176,196}
};
double MemoryParameters::IDD5[2][4]=
{
{150,205,210,220},
{190,190,190,192}
};
// I/O read energy table. channel_conf::calc_power() looks it up as
// [io_type][DIMM type][num_dimm_per_channel-1][bw_index(io_type, bandwidth)].
double MemoryParameters::io_energy_read[2][3][3][4] =// [ddr3:ddr4][udimm:rdimm:lrdimm][load 1:2:3][frequency 0:1:2:3]
{
{ //ddr3
{//udimm
{2592.33, 2593.33, 3288.784, 4348.612},
{2638.23, 2640.23, 3941.584, 5415.492},
{2978.659, 2981.659, 4816.644, 6964.162}
},
{//rdimm
{2592.33, 3087.071, 3865.044, 4844.982},
{2932.759, 3733.318, 4237.634, 5415.492},
{3572.509, 4603.109, 5300.004, 6964.162}
},
{//lrdimm
{4628.966, 6357.625, 7079.348, 9680.454},
{5368.26, 6418.788, 7428.058, 10057.164},
{5708.689, 7065.038, 7808.678, 10627.674}
}
},
{ //ddr4
{//udimm
{2135.906, 2633.317, 2750.919, 2869.406},
{2458.714, 2695.791, 2822.298, 3211.111},
{2622.85, 3030.048, 3160.265, 3534.448}
},
{//rdimm
{2135.906, 2633.317, 2750.919, 2869.406},
{2458.714, 2695.791, 3088.886, 3211.111},
{2622.85, 3030.048, 3312.468, 3758.445}
},
{//lrdimm
{4226.903, 5015.342, 5490.61, 5979.864},
{4280.471, 5319.132, 5668.945, 6060.216},
{4603.279, 5381.605, 5740.325, 6401.926}
}
}
};
// I/O write energy table, indexed exactly like io_energy_read:
// [ddr3:ddr4][udimm:rdimm:lrdimm][load 1:2:3][frequency 0:1:2:3]
double MemoryParameters::io_energy_write[2][3][3][4] =
{
{ //ddr3
{//udimm
{2758.951, 2984.854, 3571.804, 4838.902},
{2804.851, 3768.524, 4352.214, 5580.362},
{3213.897, 3829.684, 5425.854, 6933.512}
},
{//rdimm
{2758.951, 3346.104, 3931.154, 4838.902},
{3167.997, 4114.754, 4696.724, 5580.362},
{3561.831, 3829.684, 6039.994, 8075.542}
},
{//lrdimm
{4872.238, 5374.314, 7013.868, 9267.574},
{5701.502, 6214.348, 7449.758, 10045.004},
{5747.402, 6998.018, 8230.168, 10786.464}
}
},
{ //ddr4
{//udimm
{2525.129, 2840.853, 2979.037, 3293.608},
{2933.756, 3080.126, 3226.497, 3979.698},
{3293.964, 3753.37, 3906.137, 4312.448}
},
{//rdimm
{2525.129, 2840.853, 3155.117, 3293.608},
{2933.756, 3080.126, 3834.757, 3979.698},
{3293.964, 3753.37, 4413.037, 5358.078}
},
{//lrdimm
{4816.453, 5692.314, 5996.134, 6652.936},
{4870.021, 5754.788, 6067.514, 6908.636},
{5298.373, 5994.07, 6491.054, 7594.726}
}
}
};
// Timing tables with two entries each; presumably [ddr3:ddr4] -- TODO confirm index meaning and units.
double MemoryParameters::T_RAS[2] = {35,35};
double MemoryParameters::T_RC[2] = {47.5,47.5};
double MemoryParameters::T_RP[2] = {13,13};
double MemoryParameters::T_RFC[2] = {340,260};
double MemoryParameters::T_REFI[2] = {7800,7800};
// Channel bandwidth steps; presumably [ddr3:ddr4][step]. The first three values
// of each row match the thresholds tested in bw_index().
int MemoryParameters::bandwidth_load[2][4]={{400,533,667,800},{800,933,1066,1200}};
// DIMM cost table, read by the channel_conf constructor as
// [io_type][DIMM type][capacity bucket], where the bucket is the
// histogram_capacity index (0->4GB ... 4->64GB).
// NOTE(review): INF presumably marks combinations that are not available -- confirm.
double MemoryParameters::cost[2][3][5] =
{
{
{40.38,76.13,INF,INF,INF},
{42.24,64.17,122.6,304.3,INF},
{INF,INF,211.3,287.5,1079.5}
},
{
{25.99,45.99,INF,INF,INF},
{32.99,60.45,126,296.3,INF},
{INF,INF,278.99,333,1474}
}
};
///////////////////////////////////////////////////////////////////////////////////
// Stub: no power model is implemented yet, so every input is ignored and the
// result is always zero.
double calculate_power(double load, double row_buffer_hr, double rd_wr_ratio, int chips_per_rank, int frequency_index, int lp)
{
	(void)load;
	(void)row_buffer_hr;
	(void)rd_wr_ratio;
	(void)chips_per_rank;
	(void)frequency_index;
	(void)lp;
	const double no_power = 0;
	return no_power;
}
// Maps a channel bandwidth to a frequency index in 0..3.
// DDR3 uses the upper bounds 400/533/667, every other I/O type uses
// 800/933/1066; anything above the last bound maps to index 3.
int bw_index(Mem_IO_type type, int bandwidth)
{
	static const int ddr3_bounds[3]  = {400, 533, 667};
	static const int other_bounds[3] = {800, 933, 1066};
	const int* bounds = (type==DDR3) ? ddr3_bounds : other_bounds;
	for(int idx=0; idx<3; ++idx)
	{
		if(bandwidth <= bounds[idx])
			return idx;
	}
	return 3;
}
// Builds one channel from the capacities of the DIMMs plugged into it.
//
//   memcad_params  solver parameters (io_type selects the cost/energy tables)
//   dimm_cap       capacity of each DIMM slot; 0 marks an empty slot. Non-empty
//                  entries must fall in one of the five histogram buckets
//                  (0->4GB, 1->8GB, 2->16GB, 3->32GB, 4->64GB)
//   bandwidth      channel bandwidth, stored as given
//   type           DIMM type, used to index the cost and energy tables
//   low_power      stored as given
//
// Fills capacity, num_dimm_per_channel, histogram_capacity and cost, then
// calls calc_power() to fill the energy fields.
channel_conf::channel_conf(MemCadParameters * memcad_params, const vector<int>& dimm_cap, int bandwidth, Mem_DIMM type, bool low_power)
:memcad_params(memcad_params),type(type),low_power(low_power),bandwidth(bandwidth),latency(0),valid(true)
{
	//assert(memcad_params);
	assert(dimm_cap.size() <=DIMM_PER_CHANNEL);
	assert(memcad_params->io_type<2); // So far, we just support DDR3 and DDR4.
	// updating capacity
	num_dimm_per_channel=0;
	capacity =0;
	for(int i=0;i<5;i++) histogram_capacity[i]=0;
	for(unsigned int i=0;i<dimm_cap.size();i++)
	{
		if(dimm_cap[i]==0)
			continue;
		// log2 of a non-positive value would make the cast below meaningless.
		assert(dimm_cap[i]>0);
		// +0.1 keeps exact powers of two from rounding down to the previous bucket.
		int index =(int)(log2(dimm_cap[i]+0.1))-2;
		// Capacities below 4 give a negative index, which would write
		// in front of histogram_capacity.
		assert(index>=0 && index<5);
		histogram_capacity[index]++;
		num_dimm_per_channel++;
		capacity += dimm_cap[i];
	}
	// updating bandwidth
	// NOTE(review): `bandwidth` here is the constructor parameter, which hides
	// the member of the same name, so this store leaves this->bandwidth as
	// initialized above. Kept as is; confirm the intent before changing it.
	if(capacity>0)
		bandwidth =0;
	//bandwidth = MemoryParameters::bandwidth_load[memcad_params->io_type][4-num_dimm_per_channel];
	// updating channel cost
	cost =0;
	for(int i=0;i<5;++i)
		cost += histogram_capacity[i] * MemoryParameters::cost[memcad_params->io_type][type][i];
	// update energy
	calc_power();
}
void channel_conf::calc_power()
{
double read_ratio = memcad_params->rd_2_wr_ratio/(1.0+memcad_params->rd_2_wr_ratio);
double write_ratio = 1.0/(1.0+memcad_params->rd_2_wr_ratio);
Mem_IO_type current_io_type = memcad_params->io_type;
double capacity_ratio = (capacity/(double) memcad_params->capacity );
double T_BURST = 4; // memory cycles
energy_per_read = MemoryParameters::io_energy_read[current_io_type][type][num_dimm_per_channel-1][bw_index(current_io_type,bandwidth)];
energy_per_read /= (bandwidth/T_BURST);
energy_per_write = MemoryParameters::io_energy_write[current_io_type][type][num_dimm_per_channel-1][bw_index(current_io_type,bandwidth)];
energy_per_write /= (bandwidth/T_BURST);
if(memcad_params->capacity_wise)
{
energy_per_read *= capacity_ratio;
energy_per_write *= capacity_ratio;
}
energy_per_access = read_ratio* energy_per_read + write_ratio*energy_per_write;
}
// Returns a newly allocated channel equivalent to `origin`, rebuilt from its
// capacity histogram (bucket b holds DIMMs of 4<<b GB). The caller owns the
// returned object.
channel_conf* clone(channel_conf* origin)
{
	vector<int> dimm_sizes;
	for(int bucket=0; bucket<5; ++bucket)
	{
		const int dimm_size = 4 << bucket;
		int remaining = origin->histogram_capacity[bucket];
		while(remaining > 0)
		{
			dimm_sizes.push_back(dimm_size);
			--remaining;
		}
	}
	return new channel_conf(origin->memcad_params,dimm_sizes,origin->bandwidth, origin->type,origin->low_power);
}
// Writes a one-line summary of a channel: capacity, bandwidth, cost, DIMMs per
// channel, energy per access, DIMM type, low-power flag and the DIMM count in
// each capacity bucket.
ostream& operator<<(ostream &os, const channel_conf& ch_cnf)
{
	const char* dimm_name;
	if(ch_cnf.type==UDIMM)
		dimm_name = " UDIMM ";
	else if(ch_cnf.type==RDIMM)
		dimm_name = " RDIMM ";
	else
		dimm_name = "LRDIMM ";
	const char* low_power_flag = (ch_cnf.low_power)? "T ":"F ";

	os << "cap: " << ch_cnf.capacity << " GB "
	   << "bw: " << ch_cnf.bandwidth << " (MHz) "
	   << "cost: $" << ch_cnf.cost << " "
	   << "dpc: " << ch_cnf.num_dimm_per_channel << " "
	   << "energy: " << ch_cnf.energy_per_access << " (nJ) "
	   << " DIMM: " << dimm_name
	   << " low power: " << low_power_flag
	   << "[ ";
	for(int bucket=0; bucket<5; ++bucket)
	{
		const int bucket_gb = (1<<(bucket+2));
		os << ch_cnf.histogram_capacity[bucket] << "(" << bucket_gb << "GB) ";
	}
	os << "]";
	return os;
}
// Builds a BoB from the given channels. The channel pointers are stored (not
// copied) and capacity, cost, bandwidth and the three energy figures are the
// sums over all channels. Unused slots of `channels` are left null.
bob_conf::bob_conf(MemCadParameters * memcad_params, vector<channel_conf*> * in_channels)
:memcad_params(memcad_params),num_channels(0),capacity(0),bandwidth(0)
,energy_per_read(0),energy_per_write(0),energy_per_access(0),cost(0),latency(0),valid(true)
{
	assert(in_channels->size() <= MAX_NUM_CHANNELS_PER_BOB);
	for(int slot=0; slot<MAX_NUM_CHANNELS_PER_BOB; ++slot)
		channels[slot]=0;
	for(unsigned int slot=0; slot<in_channels->size(); ++slot)
	{
		channel_conf* member = (*in_channels)[slot];
		channels[slot] = member;
		++num_channels;
		capacity += member->capacity;
		cost += member->cost;
		bandwidth += member->bandwidth;
		energy_per_read += member->energy_per_read;
		energy_per_write += member->energy_per_write;
		energy_per_access += member->energy_per_access;
	}
}
// Returns a newly allocated BoB that refers to the same channel objects as
// `origin` (the channels themselves are not duplicated). The caller owns the
// returned object.
bob_conf* clone(bob_conf* origin)
{
	vector<channel_conf*> shared_channels;
	// Slots are filled from the front; the first null slot ends the list.
	for(int slot=0; slot<MAX_NUM_CHANNELS_PER_BOB && (origin->channels)[slot]!=0; ++slot)
		shared_channels.push_back( (origin->channels)[slot] );
	return new bob_conf(origin->memcad_params,&shared_channels);
}
// Writes a BoB summary line followed by one line per channel, framed by two
// separator lines.
ostream & operator <<(ostream &os, const bob_conf& bob_cnf)
{
	os << " " << "BoB "
	   << "cap: " << bob_cnf.capacity << " GB "
	   << "num_channels: " << bob_cnf.num_channels << " "
	   << "bw: " << bob_cnf.bandwidth << " (MHz) "
	   << "cost: $" << bob_cnf.cost << " "
	   << "energy: " << bob_cnf.energy_per_access << " (nJ) "
	   << endl;
	os << " " << " ==============" << endl;
	for(int slot=0; slot<bob_cnf.num_channels; ++slot)
	{
		const channel_conf& member = *bob_cnf.channels[slot];
		os << " (" << slot << ") " << member << endl ;
	}
	os << " " << " =============="<< endl;
	return os;
}
// Builds a memory from the given BoBs. The BoB pointers are stored (not
// copied) and capacity, cost, bandwidth and the three energy figures are the
// sums over all BoBs. Unused slots of `bobs` are left null.
memory_conf::memory_conf(MemCadParameters * memcad_params, vector<bob_conf*> * in_bobs)
:memcad_params(memcad_params),num_bobs(0),capacity(0),bandwidth(0)
,energy_per_read(0),energy_per_write(0),energy_per_access(0),cost(0),latency(0),valid(true)
{
	assert(in_bobs->size() <= MAX_NUM_BOBS);
	for(int slot=0; slot<MAX_NUM_BOBS; ++slot)
		bobs[slot]=0;
	for(unsigned int slot=0; slot<in_bobs->size(); ++slot)
	{
		bob_conf* member = (*in_bobs)[slot];
		bobs[slot] = member;
		++num_bobs;
		capacity += member->capacity;
		cost += member->cost;
		bandwidth += member->bandwidth;
		energy_per_read += member->energy_per_read;
		energy_per_write += member->energy_per_write;
		energy_per_access += member->energy_per_access;
	}
}
// Writes a memory summary line followed by a braced list with one entry per
// BoB.
ostream & operator <<(ostream &os, const memory_conf& mem_cnf)
{
	os << "Memory "
	   << "cap: " << mem_cnf.capacity << " GB "
	   << "num_bobs: " << mem_cnf.num_bobs << " "
	   << "bw: " << mem_cnf.bandwidth << " (MHz) "
	   << "cost: $" << mem_cnf.cost << " "
	   << "energy: " << mem_cnf.energy_per_access << " (nJ) "
	   << endl;
	os << " {" << endl;
	for(int slot=0; slot<mem_cnf.num_bobs; ++slot)
	{
		const bob_conf& member = *mem_cnf.bobs[slot];
		os<< " (" << slot <<") " << member << endl ;
	}
	os << " }"<< endl;
	return os;
}

View file

@ -0,0 +1,251 @@
#ifndef __MEMCAD_PARAMS_H__
#define __MEMCAD_PARAMS_H__
#include <vector>
#include <iostream>
#include "cacti_interface.h"
#include "const.h"
#include "parameter.h"
using namespace std;
///#define INF 1000000
// Tolerance below which two energy values are treated as equal when ranking.
#define EPS 0.0000001
#define MAX_DIMM_PER_CHANNEL 3
#define MAX_CAP_PER_DIMM 64
#define MAX_RANKS_PER_DIMM 4
#define MIN_BW_PER_CHANNEL 400
#define MAX_DDR3_CHANNEL_BW 800
#define MAX_DDR4_CHANNEL_BW 1600
// Size of bob_conf::channels.
#define MAX_NUM_CHANNELS_PER_BOB 2
// Size of memory_conf::bobs.
#define MAX_NUM_BOBS 6
// Upper bound on the DIMM list accepted by the channel_conf constructor.
// NOTE(review): same value as MAX_DIMM_PER_CHANNEL above -- confirm whether both are needed.
#define DIMM_PER_CHANNEL 3
/*
enum Mem_IO_type
{
DDR3,
DDR4,
LPDDR2,
WideIO,
Low_Swing_Diff,
Serial
};
enum Mem_DIMM
{
UDIMM,
RDIMM,
LRDIMM
};
*/
// User-level settings of a MemCAD run. The constructor copies part of them
// from the parsed input and sets the rest to built-in values.
class MemCadParameters
{
public:
Mem_IO_type io_type; // DDR3 vs. DDR4
int capacity; // in GB
int num_bobs; // number of BoBs in the memory
///int bw_per_channel; // default=1600 MHz;
///bool with_bob;
int num_channels_per_bob; // 1 means no bob
bool capacity_wise; // true means the load on each channel is proportional to its capacity.
///int min_bandwith;
// Ranking criteria, applied in this order after capacity.
MemCad_metrics first_metric;
MemCad_metrics second_metric;
MemCad_metrics third_metric;
DIMM_Model dimm_model;
bool low_power_permitted; // Not yet implemented. It determines acceptable VDDs.
double load; // between 0 and 1
double row_buffer_hit_rate;
double rd_2_wr_ratio; // reads per write; weights read vs. write energy
bool same_bw_in_bob; // true if all the channels in the bob have the same bandwidth.
bool mirror_in_bob;// true if all the channels in the bob have the same configs
bool total_power; // false means just considering I/O Power
bool verbose; // true prints every candidate, not only the best ones
// Functions
MemCadParameters(InputParameter * g_ip);
void print_inputs();
bool sanity_check();
};
//////////////////////////////////////////////////////////////////////////////////
// Static lookup tables (voltage, currents, I/O energy, timing, bandwidth and
// cost) used by the MemCAD model.
class MemoryParameters
{
public:
// Power Parameters
static double VDD[2][2][4];
static double IDD0[2][4];
// NOTE(review): IDD1 has no definition in memcad_parameters.cc; using it would fail to link.
static double IDD1[2][4];
static double IDD2P0[2][4];
static double IDD2P1[2][4];
static double IDD2N[2][4];
static double IDD3P[2][4];
static double IDD3N[2][4];
static double IDD4R[2][4];
static double IDD4W[2][4];
static double IDD5[2][4];
static double io_energy_read[2][3][3][4];
static double io_energy_write[2][3][3][4];
// Timing Parameters
static double T_RAS[2];
static double T_RC[2];
static double T_RP[2];
static double T_RFC[2];
static double T_REFI[2];
// Bandwidth Parameters
static int bandwidth_load[2][4];
// Cost Parameters
static double cost[2][3][5];
// Functions
// NOTE(review): memcad_parameters.cc defines neither this constructor nor
// this member bw_index; the bw_index defined there is the free function.
MemoryParameters();
int bw_index(Mem_IO_type type, int bandwidth);
};
///////////////////////////////////////////////////////////////////////////
int bw_index(Mem_IO_type type, int bandwidth);
///////////////////////////////////////////////////////////////////////////
// One memory channel: the DIMMs plugged into it and the derived capacity,
// cost and energy figures.
class channel_conf
{
public:
MemCadParameters *memcad_params; // not owned
Mem_DIMM type;
int num_dimm_per_channel; // number of non-empty DIMM slots
int histogram_capacity[5]; // 0->4GB, 1->8GB, 2->16GB, 3->32GB, 4->64GB
bool low_power;
int capacity; // sum of the DIMM capacities
int bandwidth;
double energy_per_read;
double energy_per_write;
double energy_per_access; // read/write energies weighted by rd_2_wr_ratio
double cost; // sum of the per-DIMM costs
double latency; // set to 0 by the constructor
bool valid; // set to true by the constructor
// Functions
channel_conf(MemCadParameters * memcad_params, const vector<int>& dimm_cap, int bandwidth, Mem_DIMM type, bool low_power);
// Recomputes the three energy fields from the I/O energy tables.
void calc_power();
// Returns a newly allocated copy rebuilt from the capacity histogram.
friend channel_conf* clone(channel_conf*);
friend ostream & operator<<(ostream &os, const channel_conf& ch_cnf);
};
///////////////////////////////////////////////////////////////////////////
// A BoB: a group of up to MAX_NUM_CHANNELS_PER_BOB channels. The numeric
// fields are the sums of the corresponding channel fields.
class bob_conf
{
public:
MemCadParameters *memcad_params; // not owned
int num_channels; // number of used entries in channels
channel_conf *channels[MAX_NUM_CHANNELS_PER_BOB]; // unused entries are null
int capacity;
int bandwidth;
double energy_per_read;
double energy_per_write;
double energy_per_access;
double cost;
double latency; // set to 0 by the constructor
bool valid; // set to true by the constructor
bob_conf(MemCadParameters * memcad_params, vector<channel_conf*> * channels);
// Returns a newly allocated BoB that points at the same channel objects.
friend bob_conf* clone(bob_conf*);
friend ostream & operator <<(ostream &os, const bob_conf& bob_cnf);
};
///////////////////////////////////////////////////////////////////////////
// A complete memory: a group of up to MAX_NUM_BOBS BoBs. The numeric fields
// are the sums of the corresponding BoB fields.
class memory_conf
{
public:
MemCadParameters *memcad_params; // not owned
int num_bobs; // number of used entries in bobs
bob_conf* bobs[MAX_NUM_BOBS]; // unused entries are null
int capacity;
int bandwidth;
double energy_per_read;
double energy_per_write;
double energy_per_access;
double cost;
double latency; // set to 0 by the constructor
bool valid; // set to true by the constructor
memory_conf(MemCadParameters * memcad_params, vector<bob_conf*> * bobs);
friend ostream & operator <<(ostream &os, const memory_conf& bob_cnf);
};
#endif

View file

@ -0,0 +1,741 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#include "memorybus.h"
#include "wire.h"
#include <assert.h>
#include <iostream>
#include <cmath>
// Builds the model of one bank-level bus -- data path, row-address path or
// column-address path, selected by membus_type_ -- and immediately calls
// Network() to create its wires and drivers.
// Several members set in the initializer list (data_bits, ndbl, ndwl) are
// overwritten in the body before Network() runs.
Memorybus::Memorybus(
enum Wire_type wire_model, double mat_w, double mat_h, double subarray_w_, double subarray_h_,
int _row_add_bits, int _col_add_bits, int _data_bits, int _ndbl, int _ndwl, /*enum Htree_type htree_type,*/
enum Memorybus_type membus_type_, const DynamicParameter & dp_,
/*TechnologyParameter::*/DeviceType *dt):
dp(dp_),
in_rise_time(0), out_rise_time(0),
is_dram(dp.is_dram),
membus_type(membus_type_),
mat_width(mat_w), mat_height(mat_h), subarray_width(subarray_w_), subarray_height(subarray_h_),
data_bits(_data_bits), ndbl(_ndbl), ndwl(_ndwl),
wt(wire_model), deviceType(dt)
{
if (g_ip->print_detail_debug)
cout << "memorybus.cc: membus_type = " << membus_type << endl;
power.readOp.dynamic = 0;
power.readOp.leakage = 0;
power.readOp.gate_leakage = 0;
power.searchOp.dynamic =0;
delay = 0;
cell.h = g_tp.dram.b_h;
cell.w = g_tp.dram.b_w;
// This check sees the undivided _ndbl/_ndwl; the per-tier values are computed further down.
if (!g_ip->is_3d_mem)
assert(ndbl >= 2 && ndwl >= 2);
if (g_ip->print_detail_debug)
{
cout << "burst length: " << g_ip->burst_depth <<endl;
cout << "output width: " << g_ip->io_width <<endl;
}
//Default value
chip_IO_width = g_ip->io_width; //g_ip->out_w; //x4, x8, x16 chip
burst_length = g_ip->burst_depth; //g_ip->burst_len; //DDR2 4, DDR3 8
// Replaces the _data_bits value stored by the initializer list.
data_bits = chip_IO_width * burst_length;
row_add_bits = _row_add_bits;
col_add_bits = _col_add_bits;
max_unpipelined_link_delay = 0; //TODO
min_w_nmos = g_tp.min_w_nmos_;
min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio * min_w_nmos;
semi_repeated_global_line = 0; // 1: semi-repeated global line, repeaters in decoder stripes; 0: Non-repeated global line, slower
// Per-tier partition counts: the bank is spread over the tiers in each direction.
ndwl = _ndwl/ g_ip->num_tier_row_sprd;
ndbl = _ndbl/ g_ip->num_tier_col_sprd;
// Capped at 16.
num_subarray_global_IO = ndbl>16?16:ndbl;
switch (membus_type)
{
case Data_path:
data_bits = chip_IO_width * burst_length;
Network();
break;
case Row_add_path:
add_bits = _row_add_bits;
num_dec_signals = dp.num_r_subarray * ndbl;
Network();
break;
case Col_add_path:
add_bits = _col_add_bits;
num_dec_signals = dp.num_c_subarray * ndwl / data_bits;
Network();
break;
default:
assert(0);
break;
}
assert(power.readOp.dynamic >= 0);
assert(power.readOp.leakage >= 0);
}
// Releases the wires, drivers and decoders that Network() allocated for this
// bus. center_stripe and bank_bus exist for every bus type; the remaining
// objects depend on membus_type.
Memorybus::~Memorybus()
{
  delete center_stripe;
  delete bank_bus;
  switch (membus_type)
  {
    case Data_path:
      delete local_data;
      delete global_data;
      delete local_data_drv;
      // Network() allocates global_data_drv for every data path, whatever the
      // value of semi_repeated_global_line, so it must always be freed here.
      delete global_data_drv;
      delete out_seg;
      break;
    case Row_add_path:
      delete global_WL;
      delete add_predec;
      delete add_dec;
      delete lwl_drv;
      break;
    case Col_add_path:
      delete column_sel;
      delete add_predec;
      delete add_dec;
      break;
    default:
      assert(0);
      break;
  }
}
// ---For 3D DRAM, the bank height and length is reduced to 1/num_tier_row_sprd and 1/num_tier_col_sprd.
// ---As a result, ndwl and ndbl are also reduced to the same ratio, but he number of banks increase to the product of these two parameters
void Memorybus::Network()
{
//double POLY_RESISTIVITY = 0.148; //ohm-micron
double R_wire_dec_out = 0;
double C_ld_dec_out = 0;
double bank_bus_length = 0;
double area_bank_vertical_peripheral_circuitry = 0, area_bank_horizontal_peripheral_circuitry = 0;
area_sense_amp = (mat_height - subarray_height) * mat_width * ndbl * ndwl;
area_subarray = subarray_height * subarray_width * ndbl * ndwl;
// ---Because in 3D DRAM mat only has one subarray, but contains the subarray peripheral circuits such as SA. Detail see mat.cc is_3d_mem part.
subarray_height = mat_height;
subarray_width = mat_width;
if(g_ip->partition_gran == 0)// Coarse_rank_level: add/data bus around
{
height_bank = subarray_height * ndbl + (col_add_bits + row_add_bits)*g_tp.wire_outside_mat.pitch/2 + data_bits*g_tp.wire_outside_mat.pitch;
length_bank = subarray_width * ndwl + (col_add_bits + row_add_bits)*g_tp.wire_outside_mat.pitch/2 + data_bits*g_tp.wire_outside_mat.pitch;
area_address_bus = (row_add_bits + col_add_bits) *g_tp.wire_outside_mat.pitch * sqrt(length_bank * height_bank);
area_data_bus = data_bits *g_tp.wire_outside_mat.pitch * sqrt(length_bank * height_bank);
}
else if(g_ip->partition_gran == 1)//Fine_rank_level: add bus replaced by TSVs
{
height_bank = subarray_height * ndbl;
length_bank = subarray_width * ndwl;
area_address_bus = 0;
area_data_bus = data_bits *g_tp.wire_outside_mat.pitch * sqrt(length_bank * height_bank);
}
else if(g_ip->partition_gran == 2)//Coarse_bank_level: add/data bus replaced by TSVs
{
height_bank = subarray_height * ndbl;
length_bank = subarray_width * ndwl;
area_address_bus = 0;
area_data_bus = 0;
}
if (g_ip->print_detail_debug)
{
cout << "memorybus.cc: N subarrays per mat = " << dp.num_subarrays / dp.num_mats << endl;
cout << "memorybus.cc: g_tp.wire_local.pitch = " << g_tp.wire_local.pitch /1e3 << " mm" << endl;
cout << "memorybus.cc: subarray_width = " << subarray_width /1e3 << " mm" << endl;
cout << "memorybus.cc: subarray_height = " << subarray_height /1e3 << " mm" << endl;
cout << "memorybus.cc: mat_height = " << mat_height /1e3 << " mm" << endl;
cout << "memorybus.cc: mat_width = " << mat_width /1e3 << " mm" << endl;
cout << "memorybus.cc: height_bank = " << height_bank /1e3 << " mm" << endl;
cout << "memorybus.cc: length_bank = " << length_bank /1e3 << " mm" << endl;
}
int num_banks_hor_dir = 1 << (int)ceil((double)_log2( g_ip->nbanks * g_ip->num_tier_row_sprd )/2 ) ;
int num_banks_ver_dir = 1 << (int)ceil((double)_log2( g_ip->nbanks * g_ip->num_tier_col_sprd * g_ip->num_tier_row_sprd /num_banks_hor_dir ) );
if (g_ip->print_detail_debug)
{
cout<<"horz bank #: "<<num_banks_hor_dir<<endl;
cout<<"vert bank #: "<<num_banks_ver_dir<<endl;
cout << "memorybus.cc: g_ip->nbanks = " << g_ip->nbanks << endl;
cout << "memorybus.cc: num_banks_hor_dir = " << num_banks_hor_dir << endl;
}
// ************************************* Wire Interconnections *****************************************
double center_stripe_length = 0.5 * double(num_banks_hor_dir) * height_bank;
if(g_ip->print_detail_debug)
{
cout << "memorybus.cc: center_stripe wire length = " << center_stripe_length << " um"<< endl;
}
center_stripe = new Wire(wt, center_stripe_length);
area_bus = 2.0 * center_stripe_length * (row_add_bits + col_add_bits + data_bits) *g_tp.wire_outside_mat.pitch / g_ip->nbanks;
//if (g_ip->partition_gran == 0)
//area_bus = (row_add_bits + col_add_bits) *g_tp.wire_outside_mat.pitch * center_stripe_length;
if (membus_type == Row_add_path)
{
int num_lwl_per_gwl = 4;
global_WL = new Wire(wt, length_bank, 1, 1, 1, inside_mat, CU_RESISTIVITY, &(g_tp.peri_global));
//local_WL = new Wire(wt, length_bank/num_lwl_drv, local_wires, POLY_RESISTIVITY, &(g_tp.dram_wl));
num_lwl_drv = ndwl;
//C_GWL = num_lwl_drv * gate_C(g_tp.min_w_nmos_+min_w_pmos,0) + c_w_metal * dp.num_c_subarray * ndwl;
if(semi_repeated_global_line)
{
C_GWL = (double)num_lwl_per_gwl * gate_C(g_tp.min_w_nmos_+min_w_pmos,0) + g_tp.wire_inside_mat.C_per_um * (subarray_width + g_tp.wire_local.pitch);
R_GWL = g_tp.wire_inside_mat.R_per_um * (subarray_width + g_tp.wire_local.pitch);
}
else
{
C_GWL = (double)num_lwl_drv * num_lwl_per_gwl * gate_C(g_tp.min_w_nmos_+min_w_pmos,0) + g_tp.wire_inside_mat.C_per_um * length_bank;
R_GWL = length_bank * g_tp.wire_inside_mat.R_per_um;
}
lwl_driver_c_gate_load = dp.num_c_subarray * gate_C_pass(g_tp.dram.cell_a_w, g_tp.dram.b_w, true, true);
//lwl_driver_c_wire_load = subarray_width * g_tp.wire_local.C_per_um;
//lwl_driver_r_wire_load = subarray_width * g_tp.wire_local.R_per_um;
if (g_ip->print_detail_debug)
{
cout<<"C_GWL: "<<C_GWL<<endl;
cout<<"num_lwl_drv: "<<num_lwl_drv<<endl;
cout<<"g_tp.wire_inside_mat.C_per_um: "<<g_tp.wire_inside_mat.C_per_um<<endl;
cout<<"length_bank: "<<length_bank<<endl;
cout << "memorybus.cc: lwl single gate capacitance = " << gate_C_pass(g_tp.dram.cell_a_w, g_tp.dram.b_w, true, true) << endl;
cout << "memorybus.cc: lwl wire capacitance per single wire = " << g_tp.wire_local.C_per_um << endl;
cout << "memorybus.cc: dp.num_c_subarray = " << dp.num_c_subarray << endl;
cout << "memorybus.cc: dram.b_w = " << g_tp.dram.b_w << endl;
}
lwl_driver_c_wire_load = dp.num_c_subarray * g_tp.dram.b_w * g_tp.wire_local.C_per_um;
lwl_driver_r_wire_load = dp.num_c_subarray * g_tp.dram.b_w * g_tp.wire_local.R_per_um;
C_LWL = lwl_driver_c_gate_load + lwl_driver_c_wire_load;
lwl_drv = new Driver(
lwl_driver_c_gate_load,
lwl_driver_c_wire_load,
lwl_driver_r_wire_load,
is_dram);
lwl_drv->compute_area();
if(!g_ip->fine_gran_bank_lvl)
{
C_ld_dec_out = C_GWL;
R_wire_dec_out = R_GWL;
}
else
{
C_ld_dec_out = gate_C(g_tp.min_w_nmos_+min_w_pmos,0);
R_wire_dec_out = 0;
}
if (g_ip->print_detail_debug)
cout << "memorybus.cc: ndwl * dp.num_c_subarray * g_tp.dram.b_w = " << ndwl * dp.num_c_subarray * g_tp.dram.b_w << endl;
//bank_bus_length = double(num_banks_ver_dir) * 0.5 * (height_bank + 0.5*double(row_add_bits+col_add_bits+data_bits)*g_tp.wire_outside_mat.pitch);
bank_bus_length = double(num_banks_ver_dir) * 0.5 * MAX(length_bank, height_bank);
bank_bus = new Wire(wt, bank_bus_length);
}
else if (membus_type == Col_add_path)
{
column_sel = new Wire(wt, sqrt(length_bank * height_bank), 1, 1, 1, outside_mat, CU_RESISTIVITY, &(g_tp.peri_global));
if(semi_repeated_global_line)
{
C_colsel = g_tp.wire_inside_mat.C_per_um * (subarray_height + g_tp.wire_local.pitch) ;
R_colsel = g_tp.wire_inside_mat.R_per_um * (subarray_height + g_tp.wire_local.pitch);
}
else
{
C_colsel = column_sel->repeater_size * gate_C(g_tp.min_w_nmos_+min_w_pmos,0)
+ (column_sel->repeater_spacing < height_bank ? column_sel->repeater_spacing : height_bank) * g_tp.wire_outside_mat.C_per_um;
R_colsel = (column_sel->repeater_spacing < height_bank ? column_sel->repeater_spacing : height_bank) * g_tp.wire_outside_mat.R_per_um;
}
if(!g_ip->fine_gran_bank_lvl)
{
C_ld_dec_out = C_colsel;
//+ (int)(column_sel->repeater_spacing/height_bank) * ndbl*dp.num_r_subarray* gate_C(g_tp.w_nmos_sa_mux,0);
R_wire_dec_out = R_colsel;
}
else
{
C_ld_dec_out = gate_C(g_tp.min_w_nmos_+min_w_pmos,0);
R_wire_dec_out = 0;
}
if (g_ip->print_detail_debug)
cout << "memorybus.cc: column_sel->repeater_size = " << column_sel->repeater_size << endl;
bank_bus_length = double(num_banks_ver_dir) * 0.5 * MAX(length_bank, height_bank);
bank_bus = new Wire(wt, bank_bus_length);
}
else if (membus_type == Data_path)
{
local_data = new Wire(wt, subarray_width, 1, 1, 1, inside_mat, CU_RESISTIVITY, &(g_tp.peri_global));
global_data = new Wire(wt, sqrt(length_bank * height_bank), 1, 1, 1, outside_mat, CU_RESISTIVITY, &(g_tp.peri_global));
if(semi_repeated_global_line)
{
C_global_data = g_tp.wire_inside_mat.C_per_um * (subarray_height + g_tp.wire_local.pitch);
R_global_data = g_tp.wire_inside_mat.R_per_um * (subarray_height + g_tp.wire_local.pitch) ;
}
else
{
C_global_data = g_tp.wire_inside_mat.C_per_um * height_bank /2;
R_global_data = g_tp.wire_inside_mat.R_per_um * height_bank /2;
}
global_data_drv = new Driver(
0,
C_global_data,
R_global_data,
is_dram);
global_data_drv->compute_delay(0);
global_data_drv->compute_area();
//---Unrepeated local dataline
double local_data_c_gate_load = dp.num_c_subarray * drain_C_(g_tp.w_nmos_sa_mux, NCH, 1, 0, cell.w, is_dram);
//double local_data_c_gate_load = 0;
double local_data_c_wire_load = dp.num_c_subarray * g_tp.dram.b_w * g_tp.wire_inside_mat.C_per_um;
double local_data_r_wire_load = dp.num_c_subarray * g_tp.dram.b_w * g_tp.wire_inside_mat.R_per_um;
//double local_data_r_gate_load = tr_R_on(g_tp.w_nmos_sa_mux, NCH, 1, is_dram);
double local_data_r_gate_load = 0;
double tf = (local_data_c_gate_load + local_data_c_wire_load) * (local_data_r_wire_load + local_data_r_gate_load);
double this_delay = horowitz(0, tf, 0.5, 0.5, RISE);
//double local_data_outrisetime = this_delay/(1.0-0.5);
//---Unrepeated and undriven local dataline, not significant growth
//local_data->delay = this_delay;
//local_data->power.readOp.dynamic = (local_data_c_gate_load + local_data_c_wire_load) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;
double data_drv_c_gate_load = local_data_c_gate_load;
double data_drv_c_wire_load = local_data_c_wire_load;
double data_drv_r_wire_load = local_data_r_gate_load + local_data_r_wire_load;
//---Assume unrepeated global data path, too high RC
//double data_drv_c_wire_load = height_bank * g_tp.wire_outside_mat.C_per_um;
//double data_drv_r_wire_load = height_bank * g_tp.wire_inside_mat.R_per_um;
local_data_drv = new Driver(
data_drv_c_gate_load,
data_drv_c_wire_load,
data_drv_r_wire_load,
is_dram);
local_data_drv->compute_delay(0);
local_data_drv->compute_area();
if (g_ip->print_detail_debug)
{
cout<<"C: "<<local_data_c_gate_load + local_data_c_wire_load <<" F"<<endl;
cout<<"R: "<<local_data_r_gate_load + local_data_r_wire_load <<" Ohm"<<endl;
cout<<"this_delay" << this_delay * 1e9 <<" ns"<<endl;
cout<<" local_data_drv delay: " << local_data_drv->delay * 1e9 <<" ns"<<endl;
}
//Not accounted for.
/*local_data_drv = new Driver(
global_data->repeater_size * gate_C(g_tp.min_w_nmos_+min_w_pmos,0),
global_data->repeater_spacing * g_tp.wire_inside_mat.C_per_um,
global_data->repeater_spacing * g_tp.wire_inside_mat.R_per_um,
is_dram);*/
//bank_bus_length = double(num_banks_ver_dir) * 0.5 * (height_bank + 0.5*double(row_add_bits+col_add_bits+data_bits)*g_tp.wire_outside_mat.pitch) - height_bank + length_bank;
bank_bus_length = double(num_banks_ver_dir) * 0.5 * MAX(length_bank, height_bank);
bank_bus = new Wire(wt, bank_bus_length);
if (g_ip->print_detail_debug)
cout << "memorybus.cc: bank_bus_length = " << bank_bus_length << endl;
out_seg = new Wire(wt, 0.25 * num_banks_hor_dir * (length_bank + (row_add_bits+col_add_bits+data_bits)*g_tp.wire_outside_mat.pitch) );
area_IOSA = (875+500)*g_ip->F_sz_um*g_ip->F_sz_um * data_bits;//Reference:
area_data_drv = local_data_drv->area.get_area() * data_bits;
if(ndbl>16)
{
area_IOSA *= (double)ndbl/16.0;
area_data_drv *= (double)ndbl/16.0;
}
area_local_dataline = data_bits*subarray_width *g_tp.wire_local.pitch*ndbl;
}
// Row decoder
if (membus_type == Row_add_path || membus_type == Col_add_path )
{
if (g_ip->print_detail_debug)
{
cout << "memorybus.cc: num_dec_signals = " << num_dec_signals << endl;
cout << "memorybus.cc: C_ld_dec_out = " << C_ld_dec_out << endl;
cout << "memorybus.cc: R_wire_dec_out = " << R_wire_dec_out << endl;
cout << "memorybus.cc: is_dram = " << is_dram << endl;
cout << "memorybus.cc: cell.h = " << cell.h << endl;
}
add_dec = new Decoder(
(num_dec_signals>16)?num_dec_signals:16,
false,
C_ld_dec_out,
R_wire_dec_out,
false,
is_dram,
membus_type == Row_add_path?true:false,
cell);
// Predecoder and decoder for GWL
double C_wire_predec_blk_out;
double R_wire_predec_blk_out;
C_wire_predec_blk_out = 0; // num_subarrays_per_row * dp.num_r_subarray * g_tp.wire_inside_mat.C_per_um * cell.h;
R_wire_predec_blk_out = 0; // num_subarrays_per_row * dp.num_r_subarray * g_tp.wire_inside_mat.R_per_um * cell.h;
//int num_subarrays_per_mat = dp.num_subarrays/dp.num_mats;
int num_dec_per_predec = 1;
PredecBlk * add_predec_blk1 = new PredecBlk(
num_dec_signals,
add_dec,
C_wire_predec_blk_out,
R_wire_predec_blk_out,
num_dec_per_predec,
is_dram,
true);
PredecBlk * add_predec_blk2 = new PredecBlk(
num_dec_signals,
add_dec,
C_wire_predec_blk_out,
R_wire_predec_blk_out,
num_dec_per_predec,
is_dram,
false);
PredecBlkDrv * add_predec_blk_drv1 = new PredecBlkDrv(0, add_predec_blk1, is_dram);
PredecBlkDrv * add_predec_blk_drv2 = new PredecBlkDrv(0, add_predec_blk2, is_dram);
add_predec = new Predec(add_predec_blk_drv1, add_predec_blk_drv2);
if (membus_type == Row_add_path)
{
area_row_predec_dec = add_predec_blk_drv1->area.get_area() + add_predec_blk_drv2->area.get_area() +
add_predec_blk1->area.get_area() + add_predec_blk2->area.get_area() + num_dec_signals * add_dec->area.get_area();
area_lwl_drv = num_lwl_drv/2.0 * dp.num_r_subarray * ndbl * lwl_drv->area.get_area(); //num_lwl_drv is ndwl/the lwl driver count one gwl connects. two adjacent lwls share one driver.
if (g_ip->print_detail_debug)
{
cout<<"memorybus.cc: area_bank_vertical_peripheral_circuitry = " << area_bank_vertical_peripheral_circuitry /1e6<<" mm2"<<endl;
cout<<"memorybus.cc: lwl drv area = " << lwl_drv->area.get_area() /1e6<<" mm2"<<endl;
cout<<"memorybus.cc: total lwl drv area = " << num_lwl_drv * dp.num_r_subarray
* ndbl * lwl_drv->area.get_area() /1e6<<" mm2"<<endl;
}
}
else if (membus_type == Col_add_path)
{
area_col_predec_dec = add_predec_blk_drv1->area.get_area() + add_predec_blk_drv2->area.get_area() +
add_predec_blk1->area.get_area() + add_predec_blk2->area.get_area() + num_dec_signals * add_dec->area.get_area();
if(ndbl>16)
{
area_col_predec_dec *= (double)ndbl/16.0;
}
}
area_bank_vertical_peripheral_circuitry = area_row_predec_dec + area_lwl_drv + area_address_bus + area_data_bus ;
area_bank_horizontal_peripheral_circuitry = area_col_predec_dec + area_data_drv + (area_bus + area_IOSA)/g_ip->nbanks;
if (g_ip->print_detail_debug)
{
cout<<"memorybus.cc: add_predec_blk_drv1->area = " << add_predec_blk_drv1->area.get_area() /1e6<<" mm2"<<endl;
cout<<"memorybus.cc: add_predec_blk_drv2->area = " << add_predec_blk_drv2->area.get_area() /1e6<<" mm2"<<endl;
cout<<"memorybus.cc: add_predec_blk1->area = " << add_predec_blk1->area.get_area() /1e6<<" mm2"<<endl;
cout<<"memorybus.cc: add_predec_blk2->area = " << add_predec_blk2->area.get_area() /1e6<<" mm2"<<endl;
cout<<"memorybus.cc: total add_dec->area = " << num_dec_signals * add_dec->area.get_area() /1e6<<" mm2"<<endl;
cout<<"wire bus width for one bank = " << g_tp.wire_outside_mat.pitch * double(add_bits + add_bits + data_bits);
}
area.h = (height_bank + area_bank_horizontal_peripheral_circuitry /length_bank) * num_banks_ver_dir;
area.w = (length_bank + area_bank_vertical_peripheral_circuitry /height_bank) * num_banks_hor_dir; // bank bus, should add cmd wire and predec/decoder space
if(g_ip->partition_gran == 0)
{
area.h += g_tp.wire_outside_mat.pitch * double(add_bits + add_bits + data_bits); //center_stripe, should add cmd wire and other componets
area.w += g_tp.wire_outside_mat.pitch * double(add_bits + add_bits + data_bits); // + g_tp.wire_outside_mat.pitch * add_bits * 2.5;
}
//---This coefficient comes from the extra overhead of voltage regulator,
//---control logic, bank fuse, burst logic and I/O, see
//--- A 5.6ns Random Cycle 144Mb DRAM with 1.4Gb/s/pin and DDR3-SRAM Interface
//area.w *= 1.0672;
//area.h *= 1.0672;
if (g_ip->print_detail_debug)
{
cout<<"memorybus.cc: circuit height = "<<area_bank_horizontal_peripheral_circuitry /length_bank /1e3<<" mm"<<endl;
cout<<"memorybus.cc: circuit length = "<<area_bank_vertical_peripheral_circuitry /height_bank /1e3<<" mm"<<endl;
cout<<"memorybus.cc: area.h = "<<area.h/1e3<<" mm"<<endl;
cout<<"memorybus.cc: area.w = "<<area.w/1e3<<" mm"<<endl;
cout<<"memorybus.cc: area = "<<area.get_area()/1e6<<" mm2"<<endl;
}
}
compute_delays(0);
compute_power_energy();
}
// This is based on the same function in mat.cc
double Memorybus::compute_delays(double inrisetime)
{
// double outrisetime = 0;
double predec_outrisetime = 0, add_dec_outrisetime = 0;
double lwl_drv_outrisetime = 0;///, tf = 0;
//double local_data_drv_outrisetime = 0;
if (membus_type == Data_path)
{
delay = 0;
delay_bus = center_stripe->delay + bank_bus->delay;
delay += delay_bus;
//outrisetime = local_data_drv->compute_delay(inrisetime);
//local_data_drv_outrisetime = local_data_drv->delay;
delay_global_data = (semi_repeated_global_line >0) ? (global_data_drv->delay*num_subarray_global_IO) : (global_data_drv->delay + global_data->delay);
if(g_ip->partition_gran==0 || g_ip->partition_gran==1)
delay += delay_global_data;
//delay += local_data->delay;
delay_local_data = local_data_drv->delay;
delay += delay_local_data;
delay_data_buffer = 2 * 1e-6/(double)g_ip->sys_freq_MHz;
//delay += bank.mat.delay_subarray_out_drv_htree;
delay += delay_data_buffer;
//cout << 1e3/(double)g_ip->sys_freq_MHz<< endl;
//delay += out_seg->delay * burst_length;
if (g_ip->print_detail_debug)
cout << "memorybus.cc: data path delay = " << delay << endl;
out_rise_time = 0;
}
else
{
delay = 0;
delay_bus = center_stripe->delay + bank_bus->delay;
delay += delay_bus;
predec_outrisetime = add_predec->compute_delays(inrisetime);
add_dec_outrisetime = add_dec->compute_delays(predec_outrisetime);
delay_add_predecoder = add_predec->delay;
delay += delay_add_predecoder;
if (membus_type == Row_add_path)
{
if(semi_repeated_global_line)
{
delay_add_decoder = add_dec->delay * ndwl;
if(g_ip->page_sz_bits > 8192)
delay_add_decoder /= (double)(g_ip->page_sz_bits / 8192);
}
else
{
delay_add_decoder = add_dec->delay;
}
delay += delay_add_decoder;
// There is no function to compute_delay in wire.cc, need to double check if center_stripe->delay and bank_bus->delay is correct.
lwl_drv_outrisetime = lwl_drv->compute_delay(add_dec_outrisetime);
///tf = (lwl_driver_c_gate_load + lwl_driver_c_wire_load) * lwl_driver_r_wire_load;
// ### no need for global_WL->delay
// delay_WL = global_WL->delay + lwl_drv->delay + horowitz(lwl_drv_outrisetime, tf, 0.5, 0.5, RISE);
delay_lwl_drv = lwl_drv->delay;
if(!g_ip->fine_gran_bank_lvl)
delay += delay_lwl_drv;
if (g_ip->print_detail_debug)
cout << "memorybus.cc: row add path delay = " << delay << endl;
out_rise_time = lwl_drv_outrisetime;
}
else if (membus_type == Col_add_path)
{
if(semi_repeated_global_line)
{
delay_add_decoder = add_dec->delay * num_subarray_global_IO;
}
else
{
delay += column_sel->delay;
delay_add_decoder = add_dec->delay;
}
delay += delay_add_decoder;
out_rise_time = 0;
if (g_ip->print_detail_debug)
{
//cout << "memorybus.cc, compute_delays col: center_stripe->delay = " << center_stripe->delay << endl;
//cout << "memorybus.cc, compute_delays col: bank_bus->delay = " << bank_bus->delay << endl;
//cout << "memorybus.cc, compute_delays col: add_predec->delay = " << add_predec->delay << endl;
//cout << "memorybus.cc, compute_delays col: add_dec->delay = " << add_dec->delay << endl;
cout << "memorybus.cc: column add path delay = " << delay << endl;
}
}
else
{
assert(0);
}
}
// Double check!
out_rise_time = delay / (1.0-0.5);
// Is delay_wl_reset necessary here? Is the 'false' condition appropriate? See the same code as in mat.cc
/*if (add_dec->exist == false)
{
int delay_wl_reset = MAX(add_predec->blk1->delay, add_predec->blk2->delay);
//delay += delay_wl_reset;
}*/
return out_rise_time;
}
void Memorybus::compute_power_energy()
{
double coeff1[4] = {(double)add_bits, (double)add_bits, (double)add_bits, (double)add_bits};
double coeff2[4] = {(double)data_bits, (double)data_bits, (double)data_bits, (double)data_bits};
double coeff3[4] = {(double)num_lwl_drv, (double)num_lwl_drv, (double)num_lwl_drv, (double)num_lwl_drv};
double coeff4[4] = {(double)burst_length*chip_IO_width, (double)burst_length*chip_IO_width,
(double)burst_length*chip_IO_width, (double)burst_length*chip_IO_width};
double coeff5[4] = {(double)ndwl, (double)ndwl, (double)ndwl, (double)ndwl};
double coeff6[4] = {(double)num_subarray_global_IO, (double)num_subarray_global_IO, (double)num_subarray_global_IO, (double)num_subarray_global_IO};
//double coeff4[4] = {(double)num_dec_signals, (double)num_dec_signals, (double)num_dec_signals, (double)num_dec_signals};
switch (membus_type)
{
case Data_path:
power_bus = (center_stripe->power + bank_bus->power) * coeff2;
power_local_data = local_data_drv->power * coeff2;
power_global_data = semi_repeated_global_line >0 ? (global_data_drv->power*coeff2) : (global_data_drv->power+global_data->power);
power_global_data.readOp.dynamic = power_global_data.readOp.dynamic + 1.8/1e3*deviceType->Vdd*10.0/1e9/64*data_bits;
power = power_bus + power_local_data;
if(!g_ip->fine_gran_bank_lvl)
power = power + power_global_data;
//power += local_data->power;
power_burst = out_seg->power * coeff4;//Account for burst read, approxmate the wire length by the center stripe
//power = power + power_burst;
if(g_ip->print_detail_debug)
{
cout << "memorybus.cc: data path center stripe energy = " << center_stripe->power.readOp.dynamic*1e9 << " nJ" << endl;
cout << "memorybus.cc: data path bank bus energy = " << bank_bus->power.readOp.dynamic*1e9 << " nJ" << endl;
cout << "memorybus.cc: data path data driver energy = " << local_data_drv->power.readOp.dynamic*1e9 << " nJ" << endl;
}
break;
case Row_add_path:
power_bus = (center_stripe->power + bank_bus->power) * coeff1;
power_add_predecoder = add_predec->power;
if(semi_repeated_global_line)
{
power_add_decoders = add_dec->power * coeff5;
//power_add_decoders.readOp.dynamic /= (g_ip->page_sz_bits > 8192)?((double)g_ip->page_sz_bits/8192):1;
if(g_ip->page_sz_bits > 8192)
power_add_decoders.readOp.dynamic /= (double)(g_ip->page_sz_bits / 8192);
}
else
power_add_decoders = add_dec->power;// * (1<< add_predec->blk1->number_input_addr_bits);
power_lwl_drv = lwl_drv->power * coeff3;
//power_local_WL.readOp.dynamic = num_lwl_drv * C_LWL * deviceType->Vdd * deviceType->Vdd;
power = power_bus + power_add_predecoder + power_add_decoders + power_lwl_drv;
break;
case Col_add_path:
power_bus = (center_stripe->power + bank_bus->power) * coeff1;// + column_sel->power * double(chip_IO_width * burst_length);
power_add_predecoder = add_predec->power;
if(semi_repeated_global_line)
{
power_add_decoders = add_dec->power * coeff6;
power_add_decoders.readOp.dynamic = power_add_decoders.readOp.dynamic * g_ip->page_sz_bits / data_bits;
power_col_sel.readOp.dynamic = 0;
}
else
{
power_add_decoders = add_dec->power;// * (1<< add_predec->blk1->number_input_addr_bits);
power_col_sel.readOp.dynamic = column_sel->power.readOp.dynamic * g_ip->page_sz_bits / data_bits;
}
power = power_bus + power_add_predecoder + power_add_decoders;
if(!g_ip->fine_gran_bank_lvl)
power = power + power_col_sel;
break;
default:
assert(0);
break;
}
return;
}

View file

@ -0,0 +1,150 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#ifndef __MEMORYBUS_H__
#define __MEMORYBUS_H__
#include "basic_circuit.h"
#include "component.h"
#include "parameter.h"
//#include "assert.h"
#include "cacti_interface.h"
//#include "wire.h"
class Wire;
//#include "area.h"
#include "decoder.h"
// Model of one memory bus path built on top of Component.
// A single instance models exactly one path, selected by membus_type:
// the data path, the row address path or the column address path.
// compute_delays() and compute_power_energy() fill the delay_* / power_*
// breakdown members that apply to that path.
class Memorybus : public Component
{
public:
// dt defaults to the global peripheral device type (g_tp.peri_global).
Memorybus(enum Wire_type wire_model, double mat_w, double mat_h, double subarray_w, double subarray_h,
int _row_add_bits, int _col_add_bits, int _data_bits, int _ndbl, int _ndwl, /*enum Htree_type htree_type,*/
enum Memorybus_type membus_type, const DynamicParameter & dp_,
/*TechnologyParameter::*/DeviceType *dt = &(g_tp.peri_global)
);
~Memorybus();
//void in_membus();
//void out_membus();
void Network();
// repeaters only at h-tree nodes
void limited_in_membus();
void limited_out_membus();
void input_nand(double s1, double s2, double l);
//void output_buffer(double s1, double s2, double l);
// Array organisation parameters; held by reference, so the referenced
// object must outlive this Memorybus.
const DynamicParameter & dp;
// out_rise_time is written by compute_delays().
double in_rise_time, out_rise_time;
// Stores the input rise time; no computation is triggered.
void set_in_rise_time(double rt)
{
in_rise_time = rt;
}
double max_unpipelined_link_delay;
powerDef power_bit;
void memory_bus();
double height_bank, length_bank; // The actual height and length of a single bank including all wires between subarrays.
// center_stripe and bank_bus delays are summed into delay_bus by compute_delays().
Wire * center_stripe;
Wire * bank_bus;
Wire * global_WL; //3 hierarchical connection wires.
// Column select wire; its delay/power is only used on the column address
// path when the global lines are not semi-repeated.
Wire * column_sel;
// Data-path wires.
Wire * local_data;
Wire * global_data;
// Wire segment used to estimate power_burst on the data path.
Wire * out_seg;
// Driver for LWL connecting GWL, same as in mat.cc
double lwl_driver_c_gate_load, lwl_driver_c_wire_load, lwl_driver_r_wire_load;
// Power breakdown; compute_power_energy() sets the entries relevant to
// membus_type and sums them into the inherited `power`.
powerDef power_bus;
powerDef power_lwl_drv;
powerDef power_add_decoders;
powerDef power_global_WL;
powerDef power_local_WL;
powerDef power_add_predecoder;
powerDef power_burst;
powerDef power_col_sel;
powerDef power_local_data;
powerDef power_global_data;
// Delay breakdown; compute_delays() sets the entries relevant to membus_type.
double delay_bus, delay_add_predecoder, delay_add_decoder, delay_lwl_drv, delay_global_data, delay_local_data, delay_data_buffer;
// Area breakdown of the bank peripheral circuitry.
double area_lwl_drv, area_row_predec_dec, area_col_predec_dec, area_subarray, area_bus, area_address_bus, area_data_bus, area_data_drv, area_IOSA, area_local_dataline, area_sense_amp;
Area cell;
bool is_dram;
Driver * lwl_drv, * local_data_drv, * global_data_drv ;
// Address predecoder and decoder, used on the row/column address paths.
Predec * add_predec;
Decoder * add_dec;
double compute_delays(double inrisetime); // return outrisetime
void compute_power_energy(); //
private:
double wire_bw;
double init_wire_bw; // bus width at root
// Which path (Data_path, Row_add_path or Col_add_path) this instance models.
enum Memorybus_type membus_type;
// double htree_hnodes;
// double htree_vnodes;
double mat_width;
double mat_height;
double subarray_width, subarray_height;
//int add_bits, data_in_bits,search_data_in_bits,data_out_bits, search_data_out_bits;
int row_add_bits, col_add_bits;
int add_bits, data_bits, num_dec_signals;
// Tested as a flag: non-zero selects the semi-repeated global line model
// in both the delay and the power computation.
int semi_repeated_global_line;
int ndbl, ndwl;
// bool uca_tree; // should have full bandwidth to access all banks in the array simultaneously
// bool search_tree;
enum Wire_type wt;
double min_w_nmos;
double min_w_pmos;
int num_lwl_drv; //Ratio between GWL and LWL, how many local WL drives each GWL drives.
// burst_length * chip_IO_width scales power_burst on the data path.
int chip_IO_width;
int burst_length;
// Multiplier applied to the driver/decoder delay and power when the
// global lines are semi-repeated.
int num_subarray_global_IO;
double C_GWL, C_LWL, R_GWL, R_LWL, C_colsel, R_colsel, C_global_data, R_global_data; // Capacitance of global/local WLs.
// Device type passed to the constructor; its Vdd is used by the data-path power model.
/*TechnologyParameter::*/DeviceType *deviceType;
};
#endif

611
T1/TP/TP1/cacti_7/nuca.cc Normal file
View file

@ -0,0 +1,611 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#include "nuca.h"
#include "Ucache.h"
#include <assert.h>
/* Lower bound on the bank size explored by Nuca::sim_nuca(). sim_nuca()
 * doubles this global in place according to the cache associativity, so the
 * value is not constant over the life of the process. */
unsigned int MIN_BANKSIZE=65536;
#define FIXED_OVERHEAD 55e-12 /* clock skew and jitter in s. Ref: Hrishikesh et al ISCA 01 */
#define LATCH_DELAY 28e-12 /* latch delay in s (later should use FO4 TODO) */
/* Constant term added to the per-bank access latency in sim_nuca(). */
#define CONTR_2_BANK_LAT 0
/* Contention table loaded from contention.dat by Nuca::init_cont().
 * Only core indices 2..4 are filled by init_cont() and used by sim_nuca();
 * the last dimension holds the 8 values parsed from each line of the file. */
int cont_stats[2 /*l2 or l3*/][5/* cores */][ROUTER_TYPES][7 /*banks*/][8 /* cycle time */];
/*
 * Remembers the device type to use (defaults to the global peripheral
 * device type) and loads the contention table from contention.dat.
 */
Nuca::Nuca(/*TechnologyParameter::*/DeviceType *dt = &(g_tp.peri_global))
  : deviceType(dt)
{
  init_cont();
}
/*
 * Loads the contention statistics table (cont_stats) from the file
 * "contention.dat" in the current working directory.
 *
 * The file is read line by line in the order
 *   cache level (2) x core index (2..4) x router type x bank index (7);
 * each line has the form "<label>: v0 v1 v2 v3 v4 v5 v6 v7" and the eight
 * integers are stored in the last dimension of cont_stats.
 *
 * Terminates the process if the file is missing, or if it ends (or stops
 * matching) before all expected lines have been read.
 */
void
Nuca::init_cont()
{
  FILE *cont = fopen("contention.dat", "r");
  if (!cont) {
    cout << "contention.dat file is missing!\n";
    /* NOTE(review): status 0 on an error path; left unchanged because the
     * rest of this file uses the same convention. */
    exit(0);
  }

  char line[5000];
  char jk[5000]; /* receives the label in front of ':'; otherwise unused */

  for (int i = 0; i < 2; i++) {
    for (int j = 2; j < 5; j++) {
      for (int k = 0; k < ROUTER_TYPES; k++) {
        for (int l = 0; l < 7; l++) {
          int *temp = cont_stats[i/*l2 or l3*/][j/*core*/][k/*64 or 128 or 256 link bw*/][l /* no banks*/];

          /* The read must not live inside assert(): with NDEBUG the whole
           * expression -- including the fscanf call -- is compiled out and
           * `line` would never be filled. The field width keeps an
           * over-long line from overflowing the buffer. */
          const int got = fscanf(cont, "%4999[^\n]\n", line);
          if (got != 1) {
            cout << "contention.dat file is truncated or malformed!\n";
            fclose(cont);
            /* The original assert() aborted here (non-zero status) in
             * debug builds, so a non-zero status is kept for this path. */
            exit(1);
          }

          sscanf(line, "%4999[^:]: %d %d %d %d %d %d %d %d", jk,
                 &temp[0], &temp[1], &temp[2], &temp[3],
                 &temp[4], &temp[5], &temp[6], &temp[7]);
        }
      }
    }
  }
  fclose(cont);
}
/*
 * Dumps the part of the contention table that init_cont() fills: one output
 * line per (cache level, core index, router type, bank index) holding the
 * eight values of the last dimension, followed by a blank line.
 */
void
Nuca::print_cont_stats()
{
  for (int i = 0; i < 2; i++) {
    for (int j = 2; j < 5; j++) {
      for (int k = 0; k < ROUTER_TYPES; k++) {
        for (int l = 0; l < 7; l++) {
          /* The innermost loop used to test and increment `l` instead of
           * `m`, which printed only column 0 and cut the `l` loop short.
           * Iterate over all 8 entries of the last dimension. */
          for (int m = 0; m < 8; m++) {
            cout << cont_stats[i][j][k][l][m] << " ";
          }
          cout << endl;
        }
      }
    }
  }
  cout << endl;
}
/*
 * Releases the vertical and horizontal wire models for every wire type in
 * [wt_min, wt_max], the range selected by sim_nuca().
 */
Nuca::~Nuca()
{
  int wire_type = wt_min;
  while (wire_type <= wt_max) {
    delete wire_vertical[wire_type];
    delete wire_horizontal[wire_type];
    ++wire_type;
  }
}
/*
 * Converts a latency (in s) into a whole number of clock cycles for the
 * given operating frequency (in GHz). The latch delay and the fixed
 * skew/jitter overhead are subtracted from every clock period before the
 * latency is divided by it; the result is rounded up.
 */
int
Nuca::calc_cycles(double lat, double oper_freq)
{
  //TODO: convert latch delay to FO4 */
  const double period = 1.0/(oper_freq*1e9); /*s*/
  const double usable_period = period - LATCH_DELAY - FIXED_OVERHEAD;
  const double cycles = ceil(lat/usable_period);
  return (int)cycles;
}
/*
 * Intentionally empty. h_wire, v_wire and router are assigned in
 * Nuca::sim_nuca() from the Nuca wire arrays and from the local router
 * array, which are deleted by Nuca::~Nuca() and at the end of sim_nuca()
 * respectively. Re-enabling the deletes below would presumably free those
 * objects twice.
 */
nuca_org_t::~nuca_org_t() {
// if(h_wire) delete h_wire;
// if(v_wire) delete v_wire;
// if(router) delete router;
}
/*
* Version - 6.0
*
* Perform exhaustive search across different bank organizations,
* router configurations, grid organizations, and wire models and
* find an optimal NUCA organization
* For different bank count values
* 1. Optimal bank organization is calculated
* 2. For each bank organization, find different NUCA organizations
* using various router configurations, grid organizations,
* and wire models.
* 3. NUCA model with the least cost is picked for
* this particular bank count
* Finally include contention statistics and find the optimal
* NUCA configuration
*/
/*
 * Explores NUCA organizations and prints the best one.
 *
 * For every candidate bank size (or only the one implied by
 * g_ip->nuca_bank_count when that is non-zero) the bank is solved with
 * solve(); then, for every wire type in [wt_min, wt_max] and every router
 * configuration, all rows x columns grids whose product equals the bank
 * count are evaluated and the one with the lowest access latency is kept.
 * Contention cycles from cont_stats are added, the candidate is appended to
 * a list, and find_optimal_nuca() picks the final organization.
 *
 * Side effects: mutates g_ip->cache_sz, g_ip->nuca_cache_sz, the global
 * MIN_BANKSIZE, wt_min/wt_max and the wire_vertical/wire_horizontal arrays;
 * prints progress and results to stdout/stderr.
 */
void
Nuca::sim_nuca()
{
  /* temp variables */
  int it, ro, wr;
  int num_cyc;
  unsigned int i, j;
  unsigned int r, c;
  int l2_c;
  int bank_count = 0;
  uca_org_t ures;
  nuca_org_t *opt_n;
  mem_array tag, data;
  list<nuca_org_t *> nuca_list;

  /* the three router configurations that are explored */
  Router *router_s[ROUTER_TYPES];
  router_s[0] = new Router(64.0, 8, 4, &(g_tp.peri_global));
  router_s[0]->print_router();
  router_s[1] = new Router(128.0, 8, 4, &(g_tp.peri_global));
  router_s[1]->print_router();
  router_s[2] = new Router(256.0, 8, 4, &(g_tp.peri_global));
  router_s[2]->print_router();

  int core_in; // to store no. of cores

  /* to search diff grid organizations */
  double curr_hop, totno_hops, totno_hhops, totno_vhops, tot_lat,
         curr_acclat;
  double avg_lat, avg_hop, avg_hhop, avg_vhop, avg_dyn_power,
         avg_leakage_power;
  double opt_acclat = INF;
  int opt_rows = 0;
  int opt_columns = 0;
  double opt_avg_hop = 0;
  double opt_dyn_power = 0, opt_leakage_power = 0;
  min_values_t minval;
  int bank_start = 0;
  int flit_width = 0;

  /* vertical and horizontal hop latency values */
  int ver_hop_lat, hor_hop_lat; /* in cycles */

  /* no. of different bank sizes to consider */
  int iterations;

  g_ip->nuca_cache_sz = g_ip->cache_sz;
  nuca_list.push_back(new nuca_org_t());

  /* first index into cont_stats */
  if (g_ip->cache_level == 0) l2_c = 1;
  else l2_c = 0;

  /* second index into cont_stats */
  if (g_ip->cores <= 4) core_in = 2;
  else if (g_ip->cores <= 8) core_in = 3;
  else if (g_ip->cores <= 16) core_in = 4;
  else {cout << "Number of cores should be <= 16!\n"; exit(0);}

  // set the lower bound to an appropriate value. this depends on cache associativity
  // `<` instead of `!=`: with an associativity that is not a power of two
  // the doubling would step over the target and never terminate.
  // NOTE(review): MIN_BANKSIZE is a global, so the scaling accumulates if
  // sim_nuca() is called more than once.
  if (g_ip->assoc > 2) {
    i = 2;
    while (i < g_ip->assoc) {
      MIN_BANKSIZE *= 2;
      i *= 2;
    }
  }

  iterations = (int)logtwo((int)g_ip->cache_sz/MIN_BANKSIZE);

  if (g_ip->force_wiretype)
  {
    if (g_ip->wt == Low_swing) {
      wt_min = Low_swing;
      wt_max = Low_swing;
    }
    else {
      wt_min = Global;
      wt_max = Low_swing-1;
    }
  }
  else {
    wt_min = Global;
    wt_max = Low_swing;
  }

  if (g_ip->nuca_bank_count != 0) { // simulate just one bank
    if (g_ip->nuca_bank_count != 2 && g_ip->nuca_bank_count != 4 &&
        g_ip->nuca_bank_count != 8 && g_ip->nuca_bank_count != 16 &&
        g_ip->nuca_bank_count != 32 && g_ip->nuca_bank_count != 64) {
      fprintf(stderr,"Incorrect bank count value! Please fix the value in cache.cfg\n");
    }
    bank_start = (int)logtwo((double)g_ip->nuca_bank_count);
    iterations = bank_start+1;
    g_ip->cache_sz = g_ip->cache_sz/g_ip->nuca_bank_count;
  }

  cout << "Simulating various NUCA configurations\n";
  for (it=bank_start; it<iterations; it++) { /* different bank count values */
    ures.tag_array2 = &tag;
    ures.data_array2 = &data;
    /*
     * find the optimal bank organization
     */
    solve(&ures);
    // output_UCA(&ures);
    bank_count = g_ip->nuca_cache_sz/g_ip->cache_sz;
    cout << "====" << g_ip->cache_sz << "\n";

    for (wr=wt_min; wr<=wt_max; wr++) {
      for (ro=0; ro<ROUTER_TYPES; ro++)
      {
        flit_width = (int) router_s[ro]->flit_size; //initialize router
        nuca_list.back()->nuca_pda.cycle_time = router_s[ro]->cycle_time;

        /* calculate router and wire parameters */
        double vlength = ures.cache_ht; /* length of the wire (u)*/
        double hlength = ures.cache_len; // u

        /* find delay, area, and power for wires */
        /* NOTE(review): the previous Wire objects stored in these slots are
         * not freed here; earlier list entries still point at them through
         * h_wire/v_wire, so they cannot simply be deleted at this point. */
        wire_vertical[wr] = new Wire((enum Wire_type) wr, vlength);
        wire_horizontal[wr] = new Wire((enum Wire_type) wr, hlength);

        hor_hop_lat = calc_cycles(wire_horizontal[wr]->delay,
            1/(nuca_list.back()->nuca_pda.cycle_time*.001));
        ver_hop_lat = calc_cycles(wire_vertical[wr]->delay,
            1/(nuca_list.back()->nuca_pda.cycle_time*.001));

        /*
         * assume a grid like topology and explore for optimal network
         * configuration using different row and column count values.
         */
        for (c=1; c<=(unsigned int)bank_count; c++) {
          while (bank_count%c != 0) c++;
          r = bank_count/c;

          /*
           * to find the avg access latency of a NUCA cache, uncontended
           * access time to each bank from the
           * cache controller is calculated.
           * avg latency =
           * (sum of the access latencies to individual banks)/bank
           * count value.
           */
          totno_hops = totno_hhops = totno_vhops = tot_lat = 0;
          for (i=0; i<r; i++) {
            for (j=0; j<c; j++) {
              /*
               * vertical hops including the
               * first hop from the cache controller
               */
              curr_hop = i + 1;
              curr_hop += j; /* horizontal hops */
              totno_hhops += j;
              totno_vhops += (i+1);
              curr_acclat = (i * ver_hop_lat + CONTR_2_BANK_LAT +
                  j * hor_hop_lat);
              tot_lat += curr_acclat;
              totno_hops += curr_hop;
            }
          }
          avg_lat = tot_lat/bank_count;
          avg_hop = totno_hops/bank_count;
          avg_hhop = totno_hhops/bank_count;
          avg_vhop = totno_vhops/bank_count;

          /* net access latency */
          curr_acclat = 2*avg_lat + 2*(router_s[ro]->delay*avg_hop) +
            calc_cycles(ures.access_time,
                1/(nuca_list.back()->nuca_pda.cycle_time*.001));

          /* avg access lat of nuca */
          avg_dyn_power =
            avg_hop * (router_s[ro]->power.readOp.dynamic) +
            avg_hhop * (wire_horizontal[wr]->power.readOp.dynamic) *
              (g_ip->block_sz*8 + 64) +
            avg_vhop * (wire_vertical[wr]->power.readOp.dynamic) *
              (g_ip->block_sz*8 + 64) +
            ures.power.readOp.dynamic;

          /* NOTE(review): the vertical term multiplies by the *horizontal*
           * wire delay and omits flit_width, unlike the horizontal term.
           * This looks like a copy/paste slip but is left unchanged because
           * it alters model output -- confirm with the model owners. */
          avg_leakage_power =
            bank_count * router_s[ro]->power.readOp.leakage +
            avg_hhop * (wire_horizontal[wr]->power.readOp.leakage*
                wire_horizontal[wr]->delay) * flit_width +
            avg_vhop * (wire_vertical[wr]->power.readOp.leakage *
                wire_horizontal[wr]->delay);

          if (curr_acclat < opt_acclat) {
            opt_acclat = curr_acclat;
            opt_avg_hop = avg_hop;
            opt_rows = r;
            opt_columns = c;
            opt_dyn_power = avg_dyn_power;
            opt_leakage_power = avg_leakage_power;
          }
          totno_hops = 0;
          tot_lat = 0;
          totno_hhops = 0;
          totno_vhops = 0;
        }

        nuca_list.back()->wire_pda.power.readOp.dynamic =
          opt_avg_hop * flit_width *
          (wire_horizontal[wr]->power.readOp.dynamic +
           wire_vertical[wr]->power.readOp.dynamic);
        nuca_list.back()->avg_hops = opt_avg_hop;

        /* network delay/power */
        nuca_list.back()->h_wire = wire_horizontal[wr];
        nuca_list.back()->v_wire = wire_vertical[wr];
        nuca_list.back()->router = router_s[ro];

        /* bank delay/power */
        nuca_list.back()->bank_pda.delay = ures.access_time;
        nuca_list.back()->bank_pda.power = ures.power;
        nuca_list.back()->bank_pda.area.h = ures.cache_ht;
        nuca_list.back()->bank_pda.area.w = ures.cache_len;
        nuca_list.back()->bank_pda.cycle_time = ures.cycle_time;

        num_cyc = calc_cycles(nuca_list.back()->bank_pda.delay /*s*/,
            1/(nuca_list.back()->nuca_pda.cycle_time*.001/*GHz*/));
        if(num_cyc%2 != 0) num_cyc++;
        if (num_cyc > 16) num_cyc = 16; // we have data only up to 16 cycles
        if (num_cyc < 2) num_cyc = 2;   // keeps num_cyc/2-1 from going negative

        /* The bank dimension of cont_stats has 7 entries (0..6). The old
         * code used index 7 for it >= 7, which is one past the end; use the
         * last valid row instead. */
        const int bank_idx = (it < 7) ? it : 6;
        nuca_list.back()->nuca_pda.delay = opt_acclat +
          cont_stats[l2_c][core_in][ro][bank_idx][num_cyc/2-1];
        nuca_list.back()->contention =
          cont_stats[l2_c][core_in][ro][bank_idx][num_cyc/2-1];

        nuca_list.back()->nuca_pda.power.readOp.dynamic = opt_dyn_power;
        nuca_list.back()->nuca_pda.power.readOp.leakage = opt_leakage_power;

        /* array organization */
        nuca_list.back()->bank_count = bank_count;
        nuca_list.back()->rows = opt_rows;
        nuca_list.back()->columns = opt_columns;
        calculate_nuca_area (nuca_list.back());

        minval.update_min_values(nuca_list.back());
        nuca_list.push_back(new nuca_org_t());
        opt_acclat = BIGNUM;
      }
    }
    g_ip->cache_sz /= 2;
  }

  /* drop the unused placeholder entry at the tail */
  delete(nuca_list.back());
  nuca_list.pop_back();

  opt_n = find_optimal_nuca(&nuca_list, &minval);
  if (opt_n != NULL) {
    print_nuca(opt_n);
    g_ip->cache_sz = g_ip->nuca_cache_sz/opt_n->bank_count;
  }
  else {
    /* find_optimal_nuca() returns NULL when no candidate is left; the old
     * code dereferenced it unconditionally. Undo this function's changes
     * to cache_sz so the input size is reported as given. */
    fprintf(stderr, "No NUCA organization satisfies the given constraints\n");
    g_ip->cache_sz = g_ip->nuca_cache_sz;
  }

  list<nuca_org_t *>::iterator niter;
  for (niter = nuca_list.begin(); niter != nuca_list.end(); ++niter)
  {
    delete *niter;
  }
  nuca_list.clear();

  for(int rt=0; rt < ROUTER_TYPES; rt++)
  {
    delete router_s[rt];
  }
  g_ip->display_ip();
  // g_ip->force_cache_config = true;
  // g_ip->ndwl = 8;
  // g_ip->ndbl = 16;
  // g_ip->nspd = 4;
  // g_ip->ndcm = 1;
  // g_ip->ndsam1 = 8;
  // g_ip->ndsam2 = 32;
}
/*
 * Prints a human-readable report for the NUCA organization `fr`: bank
 * count, grid shape, network frequency, cache dimensions, router details
 * and the horizontal/vertical link statistics.
 *
 * fr must be non-NULL and have its router, h_wire and v_wire set.
 */
void
Nuca::print_nuca (nuca_org_t *fr)
{
  printf("\n---------- CACTI version 6.5, Non-uniform Cache Access "
      "----------\n\n");
  printf("Optimal number of banks - %d\n", fr->bank_count);
  printf("Grid organization rows x columns - %d x %d\n",
      fr->rows, fr->columns);
  printf("Network frequency - %g GHz\n",
      (1/fr->nuca_pda.cycle_time)*1e3);
  printf("Cache dimension (mm x mm) - %g x %g\n",
      fr->nuca_pda.area.h*1e-3,
      fr->nuca_pda.area.w*1e-3);

  fr->router->print_router();

  printf("\n\nWire stats:\n");
  switch (fr->h_wire->wt) {
    case Global:
      printf("\tWire type - Full swing global wires with least "
          "possible delay\n");
      break;
    case Global_5:
      printf("\tWire type - Full swing global wires with "
          "5%% delay penalty\n");
      break;
    case Global_10:
      printf("\tWire type - Full swing global wires with "
          "10%% delay penalty\n");
      break;
    case Global_20:
      printf("\tWire type - Full swing global wires with "
          "20%% delay penalty\n");
      break;
    case Global_30:
      printf("\tWire type - Full swing global wires with "
          "30%% delay penalty\n");
      break;
    case Low_swing:
      printf("\tWire type - Low swing wires\n");
      break;
    default:
      /* other wire types are not labelled */
      break;
  }

  printf("\tHorizontal link delay - %g (ns)\n",
      fr->h_wire->delay*1e9);
  printf("\tVertical link delay - %g (ns)\n",
      fr->v_wire->delay*1e9);
  /* bank_pda.area.w is filled from ures.cache_len, which is in microns
   * (the cache dimensions above are converted with *1e-3 for the same
   * reason). Convert to mm so the value matches the printed ns/mm unit;
   * it used to be 1000x too small. */
  printf("\tDelay/length - %g (ns/mm)\n",
      fr->h_wire->delay*1e9/(fr->bank_pda.area.w*1e-3));
  printf("\tHorizontal link energy -dynamic/access %g (nJ)\n"
      "\t -leakage %g (nW)\n\n",
      fr->h_wire->power.readOp.dynamic*1e9,
      fr->h_wire->power.readOp.leakage*1e9);
  printf("\tVertical link energy -dynamic/access %g (nJ)\n"
      "\t -leakage %g (nW)\n\n",
      fr->v_wire->power.readOp.dynamic*1e9,
      fr->v_wire->power.readOp.leakage*1e9);
  printf("\n\n");
  fr->v_wire->print_wire();
  printf("\n\nBank stats:\n");
}
/*
 * Picks the lowest-cost organization from the list `n`.
 *
 * The cost function depends on g_ip->ed:
 *   1     -> normalized delay * normalized dynamic power
 *   2     -> normalized delay^2 * normalized dynamic power
 *   other -> weighted sum of normalized delay, cycle time, dynamic power,
 *            leakage and area; organizations rejected by check_nuca_org()
 *            are removed from `n`.
 * Every value is normalized by the corresponding minimum in `minval`.
 *
 * Prints one statistics line per organization examined.
 *
 * Returns the chosen organization (still owned by the list), or NULL when
 * the list is empty or every organization was rejected.
 */
nuca_org_t *
Nuca::find_optimal_nuca (list<nuca_org_t *> *n, min_values_t *minval)
{
  double cost = 0;
  double min_cost = BIGNUM;
  nuca_org_t *res = NULL;
  float d, a, dp, lp, c;
  int v;

  dp = g_ip->dynamic_power_wt_nuca;
  lp = g_ip->leakage_power_wt_nuca;
  a = g_ip->area_wt_nuca;
  d = g_ip->delay_wt_nuca;
  c = g_ip->cycle_time_wt_nuca;

  /* The iterator is advanced explicitly: after erase() it already points
   * at the next element. The previous "erase, step back unless at begin,
   * then ++" logic skipped the element following an erased head and
   * incremented end() when the list became empty. */
  list<nuca_org_t *>::iterator niter = n->begin();
  while (niter != n->end()) {
    nuca_org_t *org = *niter;

    fprintf(stderr, "\n-----------------------------"
        "---------------\n");
    printf("NUCA___stats %d \tbankcount: lat = %g \tdynP = %g \twt = %d\t "
        "bank_dpower = %g \tleak = %g \tcycle = %g\n",
        org->bank_count,
        org->nuca_pda.delay,
        org->nuca_pda.power.readOp.dynamic,
        org->h_wire->wt,
        org->bank_pda.power.readOp.dynamic,
        org->nuca_pda.power.readOp.leakage,
        org->nuca_pda.cycle_time);

    if (g_ip->ed == 1) {
      cost = (org->nuca_pda.delay/minval->min_delay)*
        (org->nuca_pda.power.readOp.dynamic/minval->min_dyn);
    }
    else if (g_ip->ed == 2) {
      cost = (org->nuca_pda.delay/minval->min_delay)*
        (org->nuca_pda.delay/minval->min_delay)*
        (org->nuca_pda.power.readOp.dynamic/minval->min_dyn);
    }
    else {
      /*
       * check whether the current organization
       * meets the input deviation constraints
       */
      v = check_nuca_org(org, minval);
      if (minval->min_leakage == 0) minval->min_leakage = 0.1; //FIXME remove this after leakage modeling

      if (!v) {
        /* NOTE(review): the rejected nuca_org_t is only unlinked, not
         * deleted, so it is leaked unless the caller keeps another
         * reference -- confirm ownership before freeing it here. */
        niter = n->erase(niter);
        continue;
      }

      cost = (d * (org->nuca_pda.delay/minval->min_delay) +
          c * (org->nuca_pda.cycle_time/minval->min_cyc) +
          dp * (org->nuca_pda.power.readOp.dynamic/minval->min_dyn) +
          lp * (org->nuca_pda.power.readOp.leakage/minval->min_leakage) +
          a * (org->nuca_pda.area.get_area()/minval->min_area));
      fprintf(stderr, "cost = %g\n", cost);
    }

    if (min_cost > cost) {
      min_cost = cost;
      res = org;
    }
    ++niter;
  }
  return res;
}
/*
 * Test one NUCA organization against the user-supplied deviation limits.
 *
 * For delay, dynamic power, leakage power, cycle time and area (checked
 * in that order) the percentage excess over the corresponding minimum in
 * *minval is compared with the matching g_ip->*_dev_nuca limit.
 *
 * Returns 1 when no limit is exceeded, 0 otherwise.
 */
int
Nuca::check_nuca_org (nuca_org_t *n, min_values_t *minval)
{
  Component &pda = n->nuca_pda;

  /* && short-circuits, so later metrics are only evaluated if earlier ones pass */
  const bool within_limits =
    !(((pda.delay - minval->min_delay)*100/minval->min_delay) >
        g_ip->delay_dev_nuca) &&
    !(((pda.power.readOp.dynamic - minval->min_dyn)/minval->min_dyn)*100 >
        g_ip->dynamic_power_dev_nuca) &&
    !(((pda.power.readOp.leakage - minval->min_leakage)/minval->min_leakage)*100 >
        g_ip->leakage_power_dev_nuca) &&
    !(((pda.cycle_time - minval->min_cyc)/minval->min_cyc)*100 >
        g_ip->cycle_time_dev_nuca) &&
    !(((pda.area.get_area() - minval->min_area)/minval->min_area)*100 >
        g_ip->area_dev_nuca);

  return within_limits ? 1 : 0;
}
/*
 * Fill in nuca->nuca_pda.area from the grid shape.
 *
 * Each grid row contributes one bank height plus a link channel of
 * flit_size tracks (track = wire width + spacing of the horizontal
 * wire); each column contributes one bank width plus the equivalent
 * channel built from the vertical wire.
 */
void
Nuca::calculate_nuca_area (nuca_org_t *nuca)
{
  Wire *hw = nuca->h_wire;
  Wire *vw = nuca->v_wire;

  const double h_track = hw->wire_width + hw->wire_spacing;
  const double v_track = vw->wire_width + vw->wire_spacing;

  nuca->nuca_pda.area.h =
    nuca->rows * (h_track * nuca->router->flit_size +
        nuca->bank_pda.area.h);

  nuca->nuca_pda.area.w =
    nuca->columns * (v_track * nuca->router->flit_size +
        nuca->bank_pda.area.w);
}

101
T1/TP/TP1/cacti_7/nuca.h Normal file
View file

@ -0,0 +1,101 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#ifndef __NUCA_H__
#define __NUCA_H__
#include "basic_circuit.h"
#include "component.h"
#include "parameter.h"
#include "assert.h"
#include "cacti_interface.h"
#include "wire.h"
#include "mat.h"
#include "io.h"
#include "router.h"
#include <iostream>
/*
 * One candidate NUCA organization: a rows x columns grid of banks
 * together with the wires and router that connect them, and the
 * resulting area / power / timing figures.
 */
class nuca_org_t {
public:
~nuca_org_t();
// int size;
/* area, power, access time, and cycle time stats */
Component nuca_pda; // figures for the whole NUCA cache (used for cost and reporting)
Component bank_pda; // figures for a single bank
Component wire_pda;
Wire *h_wire; // horizontal link between grid nodes
Wire *v_wire; // vertical link between grid nodes
Router *router;
/* for particular network configuration
* calculated based on a cycle accurate
* simulation Ref: CACTI 6 - Tech report
*/
double contention;
/* grid network stats */
double avg_hops;
int rows;
int columns;
int bank_count;
};
/*
 * NUCA (non-uniform cache access) model: evaluates candidate
 * nuca_org_t organizations and selects/prints the best one.
 * Member functions are defined in nuca.cc.
 */
class Nuca : public Component
{
public:
Nuca(
/*TechnologyParameter::*/DeviceType *dt);
void print_router();
~Nuca();
void sim_nuca();
void init_cont();
int calc_cycles(double lat, double oper_freq);
// Computes nuca->nuca_pda.area (h and w) from the grid shape, bank area and link width.
void calculate_nuca_area (nuca_org_t *nuca);
// Returns 1 if n is within the g_ip->*_dev_nuca deviation limits relative to minval, else 0.
int check_nuca_org (nuca_org_t *n, min_values_t *minval);
// Returns the lowest-cost entry of *n (NULL if none); may erase rejected entries from *n.
nuca_org_t * find_optimal_nuca (list<nuca_org_t *> *n, min_values_t *minval);
// Prints a report for one organization to stdout.
void print_nuca(nuca_org_t *n);
void print_cont_stats();
private:
/*TechnologyParameter::*/DeviceType *deviceType;
int wt_min, wt_max;
// One wire model per wire type, for each link direction.
Wire *wire_vertical[WIRE_TYPES],
*wire_horizontal[WIRE_TYPES];
};
#endif

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
T1/TP/TP1/cacti_7/obj_dbg/cacti Executable file

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,779 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#ifndef __PARAMETER_H__
#define __PARAMETER_H__
#include "area.h"
#include "const.h"
#include "cacti_interface.h"
#include "io.h"
// parameters which are functions of certain device technology
/**
class TechnologyParameter
{
public:
class DeviceType
{
public:
double C_g_ideal;
double C_fringe;
double C_overlap;
double C_junc; // C_junc_area
double C_junc_sidewall;
double l_phy;
double l_elec;
double R_nch_on;
double R_pch_on;
double Vdd;
double Vth;
double Vcc_min;//allowed min vcc; for memory cell it is the lowest vcc for data retention. for logic it is the vcc to balance the leakage reduction and wakeup latency
double I_on_n;
double I_on_p;
double I_off_n;
double I_off_p;
double I_g_on_n;
double I_g_on_p;
double C_ox;
double t_ox;
double n_to_p_eff_curr_drv_ratio;
double long_channel_leakage_reduction;
double Mobility_n;
DeviceType(): C_g_ideal(0), C_fringe(0), C_overlap(0), C_junc(0),
C_junc_sidewall(0), l_phy(0), l_elec(0), R_nch_on(0), R_pch_on(0),
Vdd(0), Vth(0), Vcc_min(0),
I_on_n(0), I_on_p(0), I_off_n(0), I_off_p(0),I_g_on_n(0),I_g_on_p(0),
C_ox(0), t_ox(0), n_to_p_eff_curr_drv_ratio(0), long_channel_leakage_reduction(0),
Mobility_n(0) { };
void reset()
{
C_g_ideal = 0;
C_fringe = 0;
C_overlap = 0;
C_junc = 0;
l_phy = 0;
l_elec = 0;
R_nch_on = 0;
R_pch_on = 0;
Vdd = 0;
Vth = 0;
Vcc_min = 0;
I_on_n = 0;
I_on_p = 0;
I_off_n = 0;
I_off_p = 0;
I_g_on_n = 0;
I_g_on_p = 0;
C_ox = 0;
t_ox = 0;
n_to_p_eff_curr_drv_ratio = 0;
long_channel_leakage_reduction = 0;
Mobility_n = 0;
}
void display(uint32_t indent = 0);
};
class InterconnectType
{
public:
double pitch;
double R_per_um;
double C_per_um;
double horiz_dielectric_constant;
double vert_dielectric_constant;
double aspect_ratio;
double miller_value;
double ild_thickness;
InterconnectType(): pitch(0), R_per_um(0), C_per_um(0) { };
void reset()
{
pitch = 0;
R_per_um = 0;
C_per_um = 0;
horiz_dielectric_constant = 0;
vert_dielectric_constant = 0;
aspect_ratio = 0;
miller_value = 0;
ild_thickness = 0;
}
void display(uint32_t indent = 0);
};
class MemoryType
{
public:
double b_w;
double b_h;
double cell_a_w;
double cell_pmos_w;
double cell_nmos_w;
double Vbitpre;
double Vbitfloating;//voltage when floating bitline is supported
void reset()
{
b_w = 0; //fs and tech
b_h = 0; //fs and tech
cell_a_w = 0; // ram_cell_tech_type
cell_pmos_w = 0; //fs
cell_nmos_w = 0;
Vbitpre = 0;
Vbitfloating = 0;
}
void display(uint32_t indent = 0);
};
class ScalingFactor
{
public:
double logic_scaling_co_eff;
double core_tx_density;
double long_channel_leakage_reduction;
ScalingFactor(): logic_scaling_co_eff(0), core_tx_density(0),
long_channel_leakage_reduction(0) { };
void reset()
{
logic_scaling_co_eff= 0;
core_tx_density = 0;
long_channel_leakage_reduction= 0;
}
void display(uint32_t indent = 0);
};
double ram_wl_stitching_overhead_; //fs
double min_w_nmos_; //fs
double max_w_nmos_; //fs
double max_w_nmos_dec; //fs+ ram_cell_tech_type
double unit_len_wire_del; //wire_inside_mat
double FO4; //fs
double kinv; //fs
double vpp; //input
double w_sense_en;//fs
double w_sense_n; //fs
double w_sense_p; //fs
double sense_delay; // input
double sense_dy_power; //input
double w_iso; //fs
double w_poly_contact; //fs
double spacing_poly_to_poly; //fs
double spacing_poly_to_contact;//fs
//CACTI3DD TSV params
double tsv_parasitic_capacitance_fine;
double tsv_parasitic_resistance_fine;
double tsv_minimum_area_fine;
double tsv_parasitic_capacitance_coarse;
double tsv_parasitic_resistance_coarse;
double tsv_minimum_area_coarse;
//fs
double w_comp_inv_p1;
double w_comp_inv_p2;
double w_comp_inv_p3;
double w_comp_inv_n1;
double w_comp_inv_n2;
double w_comp_inv_n3;
double w_eval_inv_p;
double w_eval_inv_n;
double w_comp_n;
double w_comp_p;
double dram_cell_I_on; //ram_cell_tech_type
double dram_cell_Vdd;
double dram_cell_I_off_worst_case_len_temp;
double dram_cell_C;
double gm_sense_amp_latch; // depends on many things
double w_nmos_b_mux;//fs
double w_nmos_sa_mux;//fs
double w_pmos_bl_precharge;//fs
double w_pmos_bl_eq;//fs
double MIN_GAP_BET_P_AND_N_DIFFS;//fs
double MIN_GAP_BET_SAME_TYPE_DIFFS;//fs
double HPOWERRAIL;//fs
double cell_h_def;//fs
double chip_layout_overhead; //input
double macro_layout_overhead;
double sckt_co_eff;
double fringe_cap;//input
uint64_t h_dec; //ram_cell_tech_type
DeviceType sram_cell; // SRAM cell transistor
DeviceType dram_acc; // DRAM access transistor
DeviceType dram_wl; // DRAM wordline transistor
DeviceType peri_global; // peripheral global
DeviceType cam_cell; // SRAM cell transistor
DeviceType sleep_tx; // Sleep transistor cell transistor
InterconnectType wire_local;
InterconnectType wire_inside_mat;
InterconnectType wire_outside_mat;
ScalingFactor scaling_factor;
MemoryType sram;
MemoryType dram;
MemoryType cam;
void display(uint32_t indent = 0);
void reset()
{
dram_cell_Vdd = 0;
dram_cell_I_on = 0;
dram_cell_C = 0;
vpp = 0;
sense_delay = 0;
sense_dy_power = 0;
fringe_cap = 0;
// horiz_dielectric_constant = 0;
// vert_dielectric_constant = 0;
// aspect_ratio = 0;
// miller_value = 0;
// ild_thickness = 0;
dram_cell_I_off_worst_case_len_temp = 0;
sram_cell.reset();
dram_acc.reset();
dram_wl.reset();
peri_global.reset();
cam_cell.reset();
sleep_tx.reset();
scaling_factor.reset();
wire_local.reset();
wire_inside_mat.reset();
wire_outside_mat.reset();
sram.reset();
dram.reset();
cam.reset();
chip_layout_overhead = 0;
macro_layout_overhead = 0;
sckt_co_eff = 0;
}
};
**/
//ali
/*
 * Parameters of one transistor flavour. Every field is zeroed on
 * construction and by reset(); assign() and interpolate() (defined
 * elsewhere) fill them in.
 */
class DeviceType
{
public:
  double C_g_ideal;
  double C_fringe;
  double C_overlap;
  double C_junc; // C_junc_area
  double C_junc_sidewall;
  double l_phy;
  double l_elec;
  double R_nch_on;
  double R_pch_on;
  double Vdd;
  double Vth;
  // allowed min vcc; for a memory cell it is the lowest vcc for data
  // retention, for logic it is the vcc that balances leakage reduction
  // against wakeup latency
  double Vcc_min;
  double I_on_n;
  double I_on_p;
  double I_off_n;
  double I_off_p;
  double I_g_on_n;
  double I_g_on_p;
  double C_ox;
  double t_ox;
  double n_to_p_eff_curr_drv_ratio;
  double long_channel_leakage_reduction;
  double Mobility_n;

  // auxiliary parameters
  double Vdsat;
  double gmp_to_gmn_multiplier;

  // reset() zeroes every field, so no separate initializer list is needed.
  DeviceType() { reset(); }

  void assign(const string & in_file, int tech_flavor, unsigned int temp);
  void interpolate(double alpha, const DeviceType& dev1, const DeviceType& dev2);

  // Set every field back to zero.
  void reset()
  {
    C_g_ideal = C_fringe = C_overlap = 0;
    C_junc = C_junc_sidewall = 0;
    l_phy = l_elec = 0;
    R_nch_on = R_pch_on = 0;
    Vdd = Vth = Vcc_min = 0;
    I_on_n = I_on_p = 0;
    I_off_n = I_off_p = 0;
    I_g_on_n = I_g_on_p = 0;
    C_ox = t_ox = 0;
    n_to_p_eff_curr_drv_ratio = 0;
    long_channel_leakage_reduction = 0;
    Mobility_n = 0;
    // auxiliary parameters
    Vdsat = 0;
    gmp_to_gmn_multiplier = 0;
  }

  void display(uint32_t indent = 0) const;
  bool isEqual(const DeviceType & dev);
};
/*
 * Parameters of one interconnect (wire) flavour. Every field is zeroed
 * on construction and by reset(); assign() and interpolate() (defined
 * elsewhere) fill them in.
 */
class InterconnectType
{
public:
  double pitch;
  double R_per_um;
  double C_per_um;
  double horiz_dielectric_constant;
  double vert_dielectric_constant;
  double aspect_ratio;
  double miller_value;
  double ild_thickness;

  // auxiliary parameters
  double wire_width;
  double wire_thickness;
  double wire_spacing;
  double barrier_thickness;
  double dishing_thickness;
  double alpha_scatter;
  double fringe_cap;

  // reset() zeroes every field, so no separate initializer list is needed.
  InterconnectType() { reset(); }

  // Set every field back to zero.
  void reset()
  {
    pitch = R_per_um = C_per_um = 0;
    horiz_dielectric_constant = vert_dielectric_constant = 0;
    aspect_ratio = miller_value = ild_thickness = 0;
    // auxiliary parameters
    wire_width = wire_thickness = wire_spacing = 0;
    barrier_thickness = dishing_thickness = 0;
    alpha_scatter = 0;
    fringe_cap = 0;
  }

  void assign(const string & in_file, int projection_type, int tech_flavor);
  void interpolate(double alpha, const InterconnectType & inter1, const InterconnectType & inter2);
  void display(uint32_t indent = 0);
  bool isEqual(const InterconnectType & inter);
};
/*
 * Parameters of one memory cell flavour. Every field, including
 * area_cell and asp_ratio_cell, is zeroed on construction and by
 * reset(); assign() and interpolate() (defined elsewhere) fill them in.
 */
class MemoryType
{
public:
double b_w;
double b_h;
double cell_a_w;
double cell_pmos_w;
double cell_nmos_w;
double Vbitpre;
double Vbitfloating;//voltage when floating bitline is supported
// needed to calculate b_w b_h
double area_cell;
double asp_ratio_cell;
MemoryType(){reset();}
// Set every field back to zero.
void reset()
{
b_w=0;
b_h=0;
cell_a_w=0;
cell_pmos_w=0;
cell_nmos_w=0;
Vbitpre=0;
Vbitfloating=0;
// These two were previously left out, so a freshly constructed or
// reset object carried indeterminate values in them.
area_cell=0;
asp_ratio_cell=0;
}
void assign(const string & in_file, int tech_flavor, int cell_type); // sram(0),cam(1),dram(2)
void interpolate(double alpha, const MemoryType& dev1, const MemoryType& dev2);
void display(uint32_t indent = 0) const;
bool isEqual(const MemoryType & mem);
};
/*
 * Technology scaling factors. Every field is zeroed on construction
 * and by reset(); assign() and interpolate() (defined elsewhere) fill
 * them in.
 */
class ScalingFactor
{
public:
  double logic_scaling_co_eff;
  double core_tx_density;
  double long_channel_leakage_reduction;

  // reset() zeroes every field, so no separate initializer list is needed.
  ScalingFactor() { reset(); }

  // Set every field back to zero.
  void reset()
  {
    logic_scaling_co_eff = core_tx_density = 0;
    long_channel_leakage_reduction = 0;
  }

  void assign(const string & in_file);
  void interpolate(double alpha, const ScalingFactor& dev1, const ScalingFactor& dev2);
  void display(uint32_t indent = 0);
  bool isEqual(const ScalingFactor & scal);
};
// parameters which are functions of certain device technology
//
// Aggregates the scalar technology values together with the per-device
// (DeviceType), per-wire (InterconnectType), per-cell (MemoryType) and
// scaling (ScalingFactor) parameter sets. The constructor calls reset(),
// which zeroes every scalar and resets every nested parameter set.
// The trailing "//fs", "//input", ... tags on fields are kept from the
// original source.
class TechnologyParameter
{
public:
double ram_wl_stitching_overhead_; //fs
double min_w_nmos_; //fs
double max_w_nmos_; //fs
double max_w_nmos_dec; //fs+ ram_cell_tech_type
double unit_len_wire_del; //wire_inside_mat
double FO4; //fs
double kinv; //fs
double vpp; //input
double w_sense_en;//fs
double w_sense_n; //fs
double w_sense_p; //fs
double sense_delay; // input
double sense_dy_power; //input
double w_iso; //fs
double w_poly_contact; //fs
double spacing_poly_to_poly; //fs
double spacing_poly_to_contact;//fs
//CACTI3D auxiliary variables
double tsv_pitch;
double tsv_diameter;
double tsv_length;
double tsv_dielec_thickness;
double tsv_contact_resistance;
double tsv_depletion_width;
double tsv_liner_dielectric_constant;
//CACTI3DD TSV params
double tsv_parasitic_capacitance_fine;
double tsv_parasitic_resistance_fine;
double tsv_minimum_area_fine;
double tsv_parasitic_capacitance_coarse;
double tsv_parasitic_resistance_coarse;
double tsv_minimum_area_coarse;
//fs
double w_comp_inv_p1;
double w_comp_inv_p2;
double w_comp_inv_p3;
double w_comp_inv_n1;
double w_comp_inv_n2;
double w_comp_inv_n3;
double w_eval_inv_p;
double w_eval_inv_n;
double w_comp_n;
double w_comp_p;
double dram_cell_I_on; //ram_cell_tech_type
double dram_cell_Vdd;
double dram_cell_I_off_worst_case_len_temp;
double dram_cell_C;
double gm_sense_amp_latch; // depends on many things
double w_nmos_b_mux;//fs
double w_nmos_sa_mux;//fs
double w_pmos_bl_precharge;//fs
double w_pmos_bl_eq;//fs
double MIN_GAP_BET_P_AND_N_DIFFS;//fs
double MIN_GAP_BET_SAME_TYPE_DIFFS;//fs
double HPOWERRAIL;//fs
double cell_h_def;//fs
double chip_layout_overhead; //input
double macro_layout_overhead;
double sckt_co_eff;
double fringe_cap;//input
uint64_t h_dec; //ram_cell_tech_type
DeviceType sram_cell; // SRAM cell transistor
DeviceType dram_acc; // DRAM access transistor
DeviceType dram_wl; // DRAM wordline transistor
DeviceType peri_global; // peripheral global
DeviceType cam_cell; // SRAM cell transistor
DeviceType sleep_tx; // Sleep transistor cell transistor
InterconnectType wire_local;
InterconnectType wire_inside_mat;
InterconnectType wire_outside_mat;
ScalingFactor scaling_factor;
MemoryType sram;
MemoryType dram;
MemoryType cam;
void display(uint32_t indent = 0);
bool isEqual(const TechnologyParameter & tech);
void find_upper_and_lower_tech(double technology, int &tech_lo, string& in_file_lo, int &tech_hi, string& in_file_hi);
void assign_tsv(const string & in_file);
void init(double technology, bool is_tag);
// Start from an all-zero parameter set.
TechnologyParameter()
{
reset();
}
// Zero every scalar field and reset every nested parameter set.
void reset()
{
ram_wl_stitching_overhead_ =0; //fs
min_w_nmos_ =0; //fs
max_w_nmos_ =0; //fs
max_w_nmos_dec =0; //fs+ ram_cell_tech_type
unit_len_wire_del =0; //wire_inside_mat
FO4 =0; //fs
kinv =0; //fs
vpp =0; //input
w_sense_en =0;//fs
w_sense_n =0; //fs
w_sense_p =0; //fs
sense_delay =0; // input
sense_dy_power =0; //input
w_iso =0; //fs
w_poly_contact =0; //fs
spacing_poly_to_poly =0; //fs
spacing_poly_to_contact =0;//fs
//CACTI3D auxiliary variables
tsv_pitch =0;
tsv_diameter =0;
tsv_length =0;
tsv_dielec_thickness =0;
tsv_contact_resistance =0;
tsv_depletion_width =0;
tsv_liner_dielectric_constant =0;
//CACTI3DD TSV params
tsv_parasitic_capacitance_fine =0;
tsv_parasitic_resistance_fine =0;
tsv_minimum_area_fine =0;
tsv_parasitic_capacitance_coarse =0;
tsv_parasitic_resistance_coarse =0;
tsv_minimum_area_coarse =0;
//fs
w_comp_inv_p1 =0;
w_comp_inv_p2 =0;
w_comp_inv_p3 =0;
w_comp_inv_n1 =0;
w_comp_inv_n2 =0;
w_comp_inv_n3 =0;
w_eval_inv_p =0;
w_eval_inv_n =0;
w_comp_n =0;
w_comp_p =0;
dram_cell_I_on =0; //ram_cell_tech_type
dram_cell_Vdd =0;
dram_cell_I_off_worst_case_len_temp =0;
dram_cell_C =0;
gm_sense_amp_latch =0; // depends on many things
w_nmos_b_mux =0;//fs
w_nmos_sa_mux =0;//fs
w_pmos_bl_precharge =0;//fs
w_pmos_bl_eq =0;//fs
MIN_GAP_BET_P_AND_N_DIFFS =0;//fs
MIN_GAP_BET_SAME_TYPE_DIFFS =0;//fs
HPOWERRAIL =0;//fs
cell_h_def =0;//fs
chip_layout_overhead = 0;
macro_layout_overhead = 0;
sckt_co_eff = 0;
fringe_cap=0;//input
h_dec=0; //ram_cell_tech_type
sram_cell.reset();
dram_acc.reset();
dram_wl.reset();
peri_global.reset();
cam_cell.reset();
sleep_tx.reset();
scaling_factor.reset();
wire_local.reset();
wire_inside_mat.reset();
wire_outside_mat.reset();
sram.reset();
dram.reset();
cam.reset();
}
};
//end ali
// Parameters describing one array organization (partitioning degrees,
// per-mat / per-subbank / per-bank signal counts, subarray dimensions,
// port counts). The constructors and the private helpers are defined
// outside this header.
class DynamicParameter
{
public:
bool is_tag;
bool pure_ram;
bool pure_cam;
bool fully_assoc;
int tagbits;
int num_subarrays; // only for leakage computation -- the number of subarrays per bank
int num_mats; // only for leakage computation -- the number of mats per bank
double Nspd;
int Ndwl;
int Ndbl;
int Ndcm;
int deg_bl_muxing;
int deg_senseamp_muxing_non_associativity;
int Ndsam_lev_1;
int Ndsam_lev_2;
Wire_type wtype; // merge from cacti-7 code to cacti3d code.
int number_addr_bits_mat; // per port
int number_subbanks_decode; // per_port
int num_di_b_bank_per_port;
int num_do_b_bank_per_port;
int num_di_b_mat;
int num_do_b_mat;
int num_di_b_subbank;
int num_do_b_subbank;
int num_si_b_mat;
int num_so_b_mat;
int num_si_b_subbank;
int num_so_b_subbank;
int num_si_b_bank_per_port;
int num_so_b_bank_per_port;
int number_way_select_signals_mat;
int num_act_mats_hor_dir;
int num_act_mats_hor_dir_sl;
bool is_dram;
double V_b_sense;
unsigned int num_r_subarray;
unsigned int num_c_subarray;
int tag_num_r_subarray;//: fully associative cache tag and data must be computed together, data and tag must be separate
int tag_num_c_subarray;
int data_num_r_subarray;
int data_num_c_subarray;
int num_mats_h_dir;
int num_mats_v_dir;
uint32_t ram_cell_tech_type;
double dram_refresh_period;
DynamicParameter();
// NOTE(review): pure_ram_ / pure_cam_ are int here while the matching
// members are bool -- presumably used as flags; confirm in the definition.
DynamicParameter(
bool is_tag_,
int pure_ram_,
int pure_cam_,
double Nspd_,
unsigned int Ndwl_,
unsigned int Ndbl_,
unsigned int Ndcm_,
unsigned int Ndsam_lev_1_,
unsigned int Ndsam_lev_2_,
Wire_type wt, // merged from cacti-7 to cacti3d
bool is_main_mem_);
int use_inp_params;
unsigned int num_rw_ports;
unsigned int num_rd_ports;
unsigned int num_wr_ports;
unsigned int num_se_rd_ports; // number of single ended read ports
unsigned int num_search_ports;
unsigned int out_w;// == nr_bits_out
bool is_main_mem;
Area cell, cam_cell;//cell is the sram_cell in both normal cache/ram and FA.
bool is_valid;
private:
void ECC_adjustment();
void init_CAM();
void init_FA();
bool calc_subarr_rc(unsigned int cap); //to calculate and check subarray rows and columns
};
extern InputParameter * g_ip;
extern TechnologyParameter g_tp;
#endif

View file

@ -0,0 +1,129 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#include "area.h"
#include "powergating.h"
#include "parameter.h"
#include <iostream>
#include <math.h>
#include <assert.h>
using namespace std;
//TODO: although DTSN is used,since for memory array, the number of sleep txs
//is related to the number of rows and cols. so All calculations are still base on
//single sleep tx cases
/*
 * Size a sleep transistor for the given circuit block and compute its
 * area and wake-up penalty.
 *
 * Device parameters (Vdd/Vth of the peripheral circuit, Vth, mobility
 * and C_ox of the sleep transistor) are read from the global g_tp.
 * The required width is split evenly over _num_sleep_tx devices, and
 * the footprint is laid out with the width of `cell_`.
 */
Sleep_tx::Sleep_tx(
    double _perf_with_sleep_tx,
    double _active_Isat,//of circuit block, not sleep tx
    bool _is_footer,
    double _c_circuit_wakeup,
    double _V_delta,
    int _num_sleep_tx,
    // double _vt_circuit,
    // double _vt_sleep_tx,
    // double _mobility,//of sleep tx
    // double _c_ox,//of sleep tx
    const Area & cell_)
: perf_with_sleep_tx(_perf_with_sleep_tx),
  active_Isat(_active_Isat),
  is_footer(_is_footer),
  c_circuit_wakeup(_c_circuit_wakeup),
  V_delta(_V_delta),
  num_sleep_tx(_num_sleep_tx),
  cell(cell_),
  is_sleep_tx(true)
{
  // a single sleep tx in a network
  const double p_to_n_sz_ratio = pmos_to_nmos_sz_ratio(false, false, true);

  vdd = g_tp.peri_global.Vdd;
  vt_circuit = g_tp.peri_global.Vth;
  vt_sleep_tx = g_tp.sleep_tx.Vth;
  mobility = g_tp.sleep_tx.Mobility_n;
  c_ox = g_tp.sleep_tx.C_ox;

  // W/L uses physical numbers
  const double drive_term = perf_with_sleep_tx*mobility*c_ox*(vdd-vt_circuit)*(vdd-vt_sleep_tx);
  width = active_Isat/drive_term*g_ip->F_sz_um;
  width /= num_sleep_tx;

  // Only a single device, assumed to be laid on its side: take half of
  // an inverter's gate area at twice the cell width.
  const double device_area =
    compute_gate_area(INV, 1, width, p_to_n_sz_ratio*width, cell.w*2)/2;
  const double device_height = device_area/cell.w;
  const double device_width = cell.w;

  area.set_h(device_height);
  area.set_w(device_width);

  compute_penalty();
}
/*
 * Compute the wake-up penalty of the sleep transistor.
 *
 * Updates c_intrinsic_sleep, wakeup_delay and wakeup_power.readOp.dynamic
 * and returns wakeup_delay.
 *
 * V_delta = VDD - VCCmin; it has nothing to do with the threshold of the
 * sleep tx, although it might be OK to use the sleep tx to control it.
 *
 * The number of cycles in the wake-up latency sets the constraint on the
 * minimum number of idle clock cycles needed before a processor can enter
 * the corresponding sleep mode without any wakeup overhead.
 * If the circuit is half way to sleep and then woken up, it is still OK:
 * the wakeup latency will just be shorter than the wakeup time from full
 * sleep. So the sleep time and energy do not matter.
 */
double Sleep_tx::compute_penalty()
{
  const double p_to_n_sz_ratio = pmos_to_nmos_sz_ratio(false, false, true);
  double sat_current;

  if (is_footer)
  {
    // NMOS footer
    c_intrinsic_sleep = drain_C_(width, NCH, 1, 1, area.h, false, false, false,is_sleep_tx);
    sat_current = simplified_nmos_Isat(width, false, false, false,is_sleep_tx);
  }
  else
  {
    // PMOS header
    c_intrinsic_sleep = drain_C_(width*p_to_n_sz_ratio, PCH, 1, 1, area.h, false, false, false,is_sleep_tx);
    sat_current = simplified_pmos_Isat(width, false, false, false,is_sleep_tx);
  }

  const double switched_cap = c_circuit_wakeup + c_intrinsic_sleep;

  wakeup_delay = switched_cap*V_delta/(sat_current/Ilinear_to_Isat_ratio);
  // no 0.5 factor: half of the energy is spent entering sleep and half
  // waking up, and the two always come in pairs
  wakeup_power.readOp.dynamic = switched_cap*g_tp.sram_cell.Vdd*V_delta;

  return wakeup_delay;
}

View file

@ -0,0 +1,86 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#ifndef POWERGATING_H_
#define POWERGATING_H_
#include "component.h"
// Model of a power-gating sleep transistor attached to a circuit block.
// The constructor sizes the device and calls compute_penalty().
class Sleep_tx : public Component
{
public:
Sleep_tx(
double _perf_with_sleep_tx,
double _active_Isat,//of circuit block, not sleep tx
bool _is_footer,
double _c_circuit_wakeup,
double _V_delta,
int _num_sleep_tx,
// double _vt_circuit,
// double _vt_sleep_tx,
// double _mobility,//of sleep tx
// double _c_ox,//of sleep tx
const Area & cell_);
double perf_with_sleep_tx;
double active_Isat;
bool is_footer; // true selects the NCH branch in compute_penalty(), false the PCH branch
double vt_circuit;
double vt_sleep_tx;
double vdd;// of circuit block not sleep tx
double mobility;//of sleep tx
double c_ox;
double width;
double c_circuit_wakeup;
double c_intrinsic_sleep;
double delay, wakeup_delay;
powerDef power, wakeup_power;
// double c_circuit_sleep;
// double sleep_delay;
// powerDef sleep_power;
double V_delta;
int num_sleep_tx;
// Bound to the constructor's cell_ argument by reference: the caller's
// Area object must outlive this Sleep_tx.
const Area & cell;
bool is_sleep_tx;
// void compute_area();
double compute_penalty(); // updates the wake-up delay/energy and returns wakeup_delay
void leakage_feedback(double temperature){};
~Sleep_tx(){};
};
#endif /* POWERGATING_H_ */

View file

@ -0,0 +1,45 @@
cache 4 types
./cacti -infile test_configs/cache1.cfg #L1 2-way 32K
./cacti -infile test_configs/cache2.cfg #L2 4-way 256K
./cacti -infile test_configs/cache3.cfg #L3 8-way 16M
./cacti -infile test_configs/cache4.cfg #L1 full-asso 4K with single search port
RAM 4 types
./cacti -infile test_configs/ram1.cfg # 16M
./cacti -infile test_configs/ram2.cfg # itrs-hp itrs-lstp
./cacti -infile test_configs/ram3.cfg # two banks no-ecc 128M
./cacti -infile test_configs/ram4.cfg # 32K 2-way
CAM 4 types
./cacti -infile test_configs/cam1.cfg # same as ram1 but ram->cam and full-asso
./cacti -infile test_configs/cam2.cfg # same as cam1 with line size = 128
./cacti -infile test_configs/cam3.cfg # cam1 for 40nm technology
./cacti -infile test_configs/cam4.cfg # cam1 with exclusive read and write port
NUCA 4 types
./cacti -infile test_configs/nuca1.cfg #
./cacti -infile test_configs/nuca2.cfg
./cacti -infile test_configs/nuca3.cfg
./cacti -infile test_configs/nuca4.cfg
eDRAM 4 types
./cacti -infile test_configs/edram1.cfg #
./cacti -infile test_configs/edram2.cfg
./cacti -infile test_configs/edram3.cfg
./cacti -infile test_configs/edram4.cfg
DRAM 4 types
./cacti -infile test_configs/dram1.cfg #
./cacti -infile test_configs/dram2.cfg
./cacti -infile test_configs/dram3.cfg
./cacti -infile test_configs/dram4.cfg
IO 4 different parameters
./cacti -infile test_configs/io1.cfg #
./cacti -infile test_configs/io2.cfg
./cacti -infile test_configs/io3.cfg
./cacti -infile test_configs/io4.cfg
Power gating 4 types
./cacti -infile test_configs/power_gate1.cfg
./cacti -infile test_configs/power_gate2.cfg
./cacti -infile test_configs/power_gate3.cfg
./cacti -infile test_configs/power_gate4.cfg
3D 4 types
./cacti -infile test_configs/3D1.cfg
./cacti -infile test_configs/3D2.cfg
./cacti -infile test_configs/3D3.cfg
./cacti -infile test_configs/3D4.cfg

311
T1/TP/TP1/cacti_7/router.cc Normal file
View file

@ -0,0 +1,311 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#include "router.h"
/*
 * Build a router model and evaluate it immediately.
 *
 * flit_size_ : flit width; printed as "Link bandwidth ... (bits)".
 * vc_buf     : buffer entries per virtual channel
 *              (vc size = vc_buffer_size * flit_size).
 * vc_c       : number of virtual channels.
 * dt         : device type supplying Vdd and the n-to-p drive ratio.
 * I_, O_     : number of crossbar input / output ports.
 * M_         : network load factor applied to the dynamic power.
 *
 * All results (delay, power, area) are computed before the constructor
 * returns, via calc_router_parameters().
 */
Router::Router(
    double flit_size_,
    double vc_buf, /* vc size = vc_buffer_size * flit_size */
    double vc_c,
    /*TechnologyParameter::*/DeviceType *dt,
    double I_,
    double O_,
    double M_)
  : flit_size(flit_size_),
    vc_count(vc_c),
    vc_buffer_size(vc_buf),
    deviceType(dt),
    I(I_),
    O(O_),
    M(M_)
{
  Vdd = dt->Vdd;
  min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio*g_tp.min_w_nmos_;

  // Feature size in microns; every dimension below is scaled by 1e-6 (-> m).
  const double feature_um = g_ip->F_sz_um;

  /* Crossbar parameters. A transmission gate is employed as the connector. */
  NTtr = 10*feature_um*1e-6/2; /* transmission gate's nmos tr. length */
  PTtr = 20*feature_um*1e-6/2; /* pmos tr. length */
  wt = 15*feature_um*1e-6/2;   /* track width */
  ht = 15*feature_um*1e-6/2;   /* track height */

  /* I and O (crossbar port counts) come from the constructor arguments. */
  NTi = 12.5*feature_um*1e-6/2;
  PTi = 25*feature_um*1e-6/2;

  NTid = 60*feature_um*1e-6/2;  // m
  PTid = 120*feature_um*1e-6/2; // m
  NTod = 60*feature_um*1e-6/2;  // m
  PTod = 120*feature_um*1e-6/2; // m

  calc_router_parameters();
}
/* Nothing to release: the destructor body is empty. */
Router::~Router()
{
}
double //wire cap with triple spacing
Router::Cw3(double length) {
Wire wc(g_ip->wt, length, 1, 3, 3);
return (wc.wire_cap(length));
}
/*
 * Gate capacitance of a transistor of width w.
 * w is multiplied by 1e6 (marked "u" in the original) before the gate_C call.
 */
double
Router::gate_cap(double w)
{
  const double w_um = w*1e6; /*u*/
  return (double) gate_C(w_um, 0);
}
/*
 * Diffusion (drain) capacitance of a transistor of width w.
 *   type : 0 for n-mos, 1 for p-mos
 *   s    : number of stacked transistors (truncated to int)
 */
double
Router::diff_cap(double w, int type /*0 for n-mos and 1 for p-mos*/,
                 double s /*number of stacking transistors*/)
{
  const double w_um = w*1e6; /*u*/
  const int stack = (int) s;
  return (double) drain_C_(w_um, type, stack, 1, g_tp.cell_h_def);
}
/* crossbar related functions */
// Simple transmission-gate model: the input sees the n-mos and p-mos
// diffusion capacitances of one gate.
double
Router::transmission_buf_inpcap()
{
  const double n_diff = diff_cap(NTtr, 0, 1);
  const double p_diff = diff_cap(PTtr, 1, 1);
  return n_diff + p_diff;
}
// Output-side capacitance of one transmission gate; same expression as the
// input side (n-mos plus p-mos diffusion capacitance).
double
Router::transmission_buf_outcap()
{
  const double n_diff = diff_cap(NTtr, 0, 1);
  const double p_diff = diff_cap(PTtr, 1, 1);
  return n_diff + p_diff;
}
// Control-side capacitance of one transmission gate: the gate capacitances
// of its n-mos and p-mos devices.
double
Router::transmission_buf_ctrcap()
{
  const double n_gate = gate_cap(NTtr);
  const double p_gate = gate_cap(PTtr);
  return n_gate + p_gate;
}
// Capacitance of one crossbar input line: the wire spanning O*flit_size
// tracks, O transmission-gate inputs, and the gate and diffusion
// capacitances of the NTid/PTid devices.
double
Router::crossbar_inpline()
{
  const double wire_c   = Cw3(O*flit_size*wt);
  const double tgate_c  = O*transmission_buf_inpcap();
  const double n_gate_c = gate_cap(NTid);
  const double p_gate_c = gate_cap(PTid);
  const double n_diff_c = diff_cap(NTid, 0, 1);
  const double p_diff_c = diff_cap(PTid, 1, 1);

  // Summed left to right in the same order as the original expression.
  return (wire_c + tgate_c + n_gate_c + p_gate_c + n_diff_c + p_diff_c);
}
// Capacitance of one crossbar output line: the wire spanning I*flit_size
// tracks, I transmission-gate outputs, and the gate and diffusion
// capacitances of the NTod/PTod devices.
double
Router::crossbar_outline()
{
  const double wire_c   = Cw3(I*flit_size*ht);
  const double tgate_c  = I*transmission_buf_outcap();
  const double n_gate_c = gate_cap(NTod);
  const double p_gate_c = gate_cap(PTod);
  const double n_diff_c = diff_cap(NTod, 0, 1);
  const double p_diff_c = diff_cap(PTod, 1, 1);

  // Summed left to right in the same order as the original expression.
  return (wire_c + tgate_c + n_gate_c + p_gate_c + n_diff_c + p_diff_c);
}
// Capacitance of one crossbar control line: a wire of half the input-line
// length, flit_size transmission-gate control inputs, and the diffusion and
// gate capacitances of the NTi/PTi devices.
double
Router::crossbar_ctrline()
{
  const double wire_c   = Cw3(0.5*O*flit_size*wt);
  const double tgate_c  = flit_size*transmission_buf_ctrcap();
  const double n_diff_c = diff_cap(NTi, 0, 1);
  const double p_diff_c = diff_cap(PTi, 1, 1);
  const double n_gate_c = gate_cap(NTi);
  const double p_gate_c = gate_cap(PTi);

  // Summed left to right in the same order as the original expression.
  return (wire_c + tgate_c + n_diff_c + p_diff_c + n_gate_c + p_gate_c);
}
// Analytical crossbar dynamic term built from the input- and output-line
// capacitances: (C_in + C_out) * Vdd^2 * flit_size / 2 each, then doubled.
double
Router::tr_crossbar_power()
{
  const double inp_term = crossbar_inpline()*Vdd*Vdd*flit_size/2;
  const double out_term = crossbar_outline()*Vdd*Vdd*flit_size/2;
  return (inp_term + out_term)*2;
}
void Router::buffer_stats()
{
DynamicParameter dyn_p;
dyn_p.is_tag = false;
dyn_p.pure_cam = false;
dyn_p.fully_assoc = false;
dyn_p.pure_ram = true;
dyn_p.is_dram = false;
dyn_p.is_main_mem = false;
dyn_p.num_subarrays = 1;
dyn_p.num_mats = 1;
dyn_p.Ndbl = 1;
dyn_p.Ndwl = 1;
dyn_p.Nspd = 1;
dyn_p.deg_bl_muxing = 1;
dyn_p.deg_senseamp_muxing_non_associativity = 1;
dyn_p.Ndsam_lev_1 = 1;
dyn_p.Ndsam_lev_2 = 1;
dyn_p.Ndcm = 1;
dyn_p.number_addr_bits_mat = 8;
dyn_p.number_way_select_signals_mat = 1;
dyn_p.number_subbanks_decode = 0;
dyn_p.num_act_mats_hor_dir = 1;
dyn_p.V_b_sense = Vdd; // FIXME check power calc.
dyn_p.ram_cell_tech_type = 0;
dyn_p.num_r_subarray = (int) vc_buffer_size;
dyn_p.num_c_subarray = (int) flit_size * (int) vc_count;
dyn_p.num_mats_h_dir = 1;
dyn_p.num_mats_v_dir = 1;
dyn_p.num_do_b_subbank = (int)flit_size;
dyn_p.num_di_b_subbank = (int)flit_size;
dyn_p.num_do_b_mat = (int) flit_size;
dyn_p.num_di_b_mat = (int) flit_size;
dyn_p.num_do_b_mat = (int) flit_size;
dyn_p.num_di_b_mat = (int) flit_size;
dyn_p.num_do_b_bank_per_port = (int) flit_size;
dyn_p.num_di_b_bank_per_port = (int) flit_size;
dyn_p.out_w = (int) flit_size;
dyn_p.use_inp_params = 1;
dyn_p.num_wr_ports = (unsigned int) vc_count;
dyn_p.num_rd_ports = 1;//(unsigned int) vc_count;//based on Bill Dally's book
dyn_p.num_rw_ports = 0;
dyn_p.num_se_rd_ports =0;
dyn_p.num_search_ports =0;
dyn_p.cell.h = g_tp.sram.b_h + 2 * g_tp.wire_outside_mat.pitch * (dyn_p.num_wr_ports +
dyn_p.num_rw_ports - 1 + dyn_p.num_rd_ports);
dyn_p.cell.w = g_tp.sram.b_w + 2 * g_tp.wire_outside_mat.pitch * (dyn_p.num_rw_ports - 1 +
(dyn_p.num_rd_ports - dyn_p.num_se_rd_ports) +
dyn_p.num_wr_ports) + g_tp.wire_outside_mat.pitch * dyn_p.num_se_rd_ports;
Mat buff(dyn_p);
buff.compute_delays(0);
buff.compute_power_energy();
buffer.power.readOp = buff.power.readOp;
buffer.power.writeOp = buffer.power.readOp; //FIXME
buffer.area = buff.area;
}
void
Router::cb_stats ()
{
if (1) {
Crossbar c_b(I, O, flit_size);
c_b.compute_power();
crossbar.delay = c_b.delay;
crossbar.power.readOp.dynamic = c_b.power.readOp.dynamic;
crossbar.power.readOp.leakage = c_b.power.readOp.leakage;
crossbar.power.readOp.gate_leakage = c_b.power.readOp.gate_leakage;
crossbar.area = c_b.area;
// c_b.print_crossbar();
}
else {
crossbar.power.readOp.dynamic = tr_crossbar_power();
crossbar.power.readOp.leakage = flit_size * I * O *
cmos_Isub_leakage(NTtr*g_tp.min_w_nmos_, PTtr*min_w_pmos, 1, tg);
crossbar.power.readOp.gate_leakage = flit_size * I * O *
cmos_Ig_leakage(NTtr*g_tp.min_w_nmos_, PTtr*min_w_pmos, 1, tg);
}
}
void
Router::get_router_power()
{
/* calculate buffer stats */
buffer_stats();
/* calculate cross-bar stats */
cb_stats();
/* calculate arbiter stats */
Arbiter vcarb(vc_count, flit_size, buffer.area.w);
Arbiter cbarb(I, flit_size, crossbar.area.w);
vcarb.compute_power();
cbarb.compute_power();
arbiter.power.readOp.dynamic = vcarb.power.readOp.dynamic * I +
cbarb.power.readOp.dynamic * O;
arbiter.power.readOp.leakage = vcarb.power.readOp.leakage * I +
cbarb.power.readOp.leakage * O;
arbiter.power.readOp.gate_leakage = vcarb.power.readOp.gate_leakage * I +
cbarb.power.readOp.gate_leakage * O;
// arb_stats();
power.readOp.dynamic = ((buffer.power.readOp.dynamic+buffer.power.writeOp.dynamic) +
crossbar.power.readOp.dynamic +
arbiter.power.readOp.dynamic)*MIN(I, O)*M;
double pppm_t[4] = {1,I,I,1};
power = power + (buffer.power*pppm_t + crossbar.power + arbiter.power)*pppm_lkg;
}
/*
 * Set the pipeline depth and the network frequency.
 *
 * The frequency starts at 5 GHz and is lowered to the maximum the technology
 * allows when a 17-FO4 cycle is longer than the requested cycle time.
 * NOTE(review): cycle_time is not recomputed when FREQUENCY is lowered, so it
 * can stay at the 5 GHz value - confirm whether callers expect it to track
 * FREQUENCY.
 */
void
Router::get_router_delay ()
{
  delay = 4; // pipeline stages (reported by print_router)

  FREQUENCY = 5; // GHz; move this to config file --TODO
  cycle_time = (1/(double)FREQUENCY)*1e3; //ps

  const double min_cycle_s = 17 * g_tp.FO4; //s
  max_cyc = min_cycle_s * 1e12; //ps

  if (max_cyc > cycle_time) {
    FREQUENCY = (1/max_cyc)*1e3; //GHz
  }
}
/*
 * Router footprint: the buffer and crossbar sit side by side (widths add),
 * and the height is I times the buffer height.
 */
void
Router::get_router_area()
{
  area.w = buffer.area.w+crossbar.area.w;
  area.h = I*buffer.area.h;
}
/*
 * Evaluate the whole router model. The order matters: the area step reads the
 * buffer and crossbar areas that the power step fills in.
 */
void
Router::calc_router_parameters()
{
  get_router_delay();  /* router frequency and pipeline cycles */
  get_router_power();  /* buffer, crossbar and arbiter power stats */
  get_router_area();   /* area stats */
}
/*
 * Print a human-readable summary of the router to stdout.
 * Areas are scaled by 1e-6 for the mm^2 figures, dynamic values by 1e9 (nJ)
 * and leakage values by 1e3 (mW).
 */
void
Router::print_router()
{
  /* overall router figures */
  cout << "\n\nRouter stats:\n"
       << "\tRouter Area - " << area.get_area()*1e-6 << "(mm^2)\n"
       << "\tMaximum possible network frequency - " << (1/max_cyc)*1e3 << "GHz\n"
       << "\tNetwork frequency - " << FREQUENCY << " GHz\n"
       << "\tNo. of Virtual channels - " << vc_count << "\n"
       << "\tNo. of pipeline stages - " << delay << endl;

  /* buffer */
  cout << "\tLink bandwidth - " << flit_size << " (bits)\n"
       << "\tNo. of buffer entries per virtual channel - " << vc_buffer_size << "\n"
       << "\tSimple buffer Area - " << buffer.area.get_area()*1e-6 << "(mm^2)\n"
       << "\tSimple buffer access (Read) - " << buffer.power.readOp.dynamic * 1e9 << " (nJ)\n"
       << "\tSimple buffer leakage - " << buffer.power.readOp.leakage * 1e3 << " (mW)\n";

  /* crossbar */
  cout << "\tCrossbar Area - " << crossbar.area.get_area()*1e-6 << "(mm^2)\n"
       << "\tCross bar access energy - " << crossbar.power.readOp.dynamic * 1e9 << " (nJ)\n"
       << "\tCross bar leakage power - " << crossbar.power.readOp.leakage * 1e3 << " (mW)\n";

  /* arbiters */
  cout << "\tArbiter access energy (VC arb + Crossbar arb) - " << arbiter.power.readOp.dynamic * 1e9 << " (nJ)\n"
       << "\tArbiter leakage (VC arb + Crossbar arb) - " << arbiter.power.readOp.leakage * 1e3 << " (mW)\n";
}

115
T1/TP/TP1/cacti_7/router.h Normal file
View file

@ -0,0 +1,115 @@
/*****************************************************************************
* CACTI 7.0
* SOFTWARE LICENSE AGREEMENT
* Copyright 2015 Hewlett-Packard Development Company, L.P.
* All Rights Reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
***************************************************************************/
#ifndef __ROUTER_H__
#define __ROUTER_H__
#include <assert.h>
#include <iostream>
#include "basic_circuit.h"
#include "cacti_interface.h"
#include "component.h"
#include "mat.h"
#include "parameter.h"
#include "wire.h"
#include "crossbar.h"
#include "arbiter.h"
/*
 * Router - Component modelling an on-chip network router as an input buffer,
 * a crossbar and two arbiters.
 *
 * The constructor evaluates the whole model (delay, power, area), so an
 * instance is ready to be read or printed as soon as it is built.
 */
class Router : public Component
{
public:
/* flit_size_ : flit width (printed as bits)
 * vc_buf     : buffer entries per virtual channel
 * vc_count   : number of virtual channels
 * dt         : device type; defaults to the global peripheral device
 * I_, O_     : crossbar input / output port counts
 * M_         : network load factor applied to dynamic power */
Router(
double flit_size_,
double vc_buf, /* vc size = vc_buffer_size * flit_size */
double vc_count,
/*TechnologyParameter::*/DeviceType *dt = &(g_tp.peri_global),
double I_ = 5,
double O_ = 5,
double M_ = 0.6);
~Router();
// Writes a summary of the computed figures to stdout.
void print_router();
// Per-part results, filled in while the router power is computed.
Component arbiter, crossbar, buffer;
// Both in ps. max_cyc is the 17-FO4 cycle used to cap the frequency.
double cycle_time, max_cyc;
double flit_size;
double vc_count;
double vc_buffer_size; /* vc size = vc_buffer_size * flit_size */
private:
/*TechnologyParameter::*/DeviceType *deviceType;
// Network frequency in GHz.
double FREQUENCY; // move this to config file --TODO
// Capacitance helpers used by the analytical crossbar model.
double Cw3(double len);
double gate_cap(double w);
double diff_cap(double w, int type /*0 for n-mos and 1 for p-mos*/, double stack);
// NOTE(review): wtype and wire_placement are not referenced in router.cc as
// seen in this commit - confirm whether they are still needed.
enum Wire_type wtype;
enum Wire_placement wire_placement;
//crossbar
// Device and track dimensions set in the constructor; I and O are the port
// counts. NOTE(review): TriS1 and TriS2 are never assigned in router.cc.
double NTtr, PTtr, wt, ht, I, O, NTi, PTi, NTid, PTid, NTod, PTod, TriS1, TriS2;
double M; //network load
double transmission_buf_inpcap();
double transmission_buf_outcap();
double transmission_buf_ctrcap();
double crossbar_inpline();
double crossbar_outline();
double crossbar_ctrline();
double tr_crossbar_power();
void cb_stats ();
// NOTE(review): arb_power(), arb_stats() and buffer_params() have no
// definition in router.cc as seen in this commit - confirm before calling.
double arb_power();
void arb_stats ();
double buffer_params();
void buffer_stats();
//arbiter
//buffer
//router params
double Vdd;
// Runs the delay, power and area steps in that order.
void calc_router_parameters();
void get_router_area();
void get_router_power();
void get_router_delay();
double min_w_pmos;
};
#endif

View file

@ -0,0 +1,259 @@
# Cache size
//-size (bytes) 2048
//-size (bytes) 4096
//-size (bytes) 32768
//-size (bytes) 131072
//-size (bytes) 262144
//-size (bytes) 1048576
//-size (bytes) 2097152
//-size (bytes) 4194304
-size (bytes) 8388608
//-size (bytes) 16777216
//-size (bytes) 33554432
//-size (bytes) 134217728
//-size (bytes) 67108864
//-size (bytes) 1073741824
# power gating
-Array Power Gating - "false"
-WL Power Gating - "false"
-CL Power Gating - "false"
-Bitline floating - "false"
-Interconnect Power Gating - "false"
-Power Gating Performance Loss 0.01
# Line size
//-block size (bytes) 8
-block size (bytes) 64
# To model Fully Associative cache, set associativity to zero
//-associativity 0
//-associativity 2
//-associativity 4
//-associativity 8
-associativity 8
-read-write port 1
-exclusive read port 0
-exclusive write port 0
-single ended read ports 0
# Multiple banks connected using a bus
-UCA bank count 1
-technology (u) 0.022
//-technology (u) 0.040
//-technology (u) 0.032
//-technology (u) 0.090
# following three parameters are meaningful only for main memories
-page size (bits) 8192
-burst length 8
-internal prefetch width 8
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
-Data array cell type - "itrs-hp"
//-Data array cell type - "itrs-lstp"
//-Data array cell type - "itrs-lop"
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
-Data array peripheral type - "itrs-hp"
//-Data array peripheral type - "itrs-lstp"
//-Data array peripheral type - "itrs-lop"
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
-Tag array cell type - "itrs-hp"
//-Tag array cell type - "itrs-lstp"
//-Tag array cell type - "itrs-lop"
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
-Tag array peripheral type - "itrs-hp"
//-Tag array peripheral type - "itrs-lstp"
//-Tag array peripheral type - "itrs-lop"
# Bus width include data bits and address bits required by the decoder
//-output/input bus width 16
-output/input bus width 512
// 300-400 in steps of 10
-operating temperature (K) 360
# Type of memory - cache (with a tag array) or ram (scratch ram similar to a register file)
# or main memory (no tag array and every access will happen at a page granularity Ref: CACTI 5.3 report)
-cache type "cache"
//-cache type "ram"
//-cache type "main memory"
# to model special structure like branch target buffers, directory, etc.
# change the tag size parameter
# if you want cacti to calculate the tagbits, set the tag size to "default"
-tag size (b) "default"
//-tag size (b) 22
# fast - data and tag access happen in parallel
# sequential - data array is accessed after accessing the tag array
# normal - data array lookup and tag access happen in parallel
# final data block is broadcasted in data array h-tree
# after getting the signal from the tag array
//-access mode (normal, sequential, fast) - "fast"
-access mode (normal, sequential, fast) - "normal"
//-access mode (normal, sequential, fast) - "sequential"
# DESIGN OBJECTIVE for UCA (or banks in NUCA)
-design objective (weight delay, dynamic power, leakage power, cycle time, area) 0:0:0:100:0
# Percentage deviation from the minimum value
# Ex: A deviation value of 10:1000:1000:1000:1000 will try to find an organization
# that compromises at most 10% delay.
# NOTE: Try reasonable values for % deviation. Inconsistent deviation
# percentage values will not produce any valid organizations. For example,
# 0:0:100:100:100 will try to identify an organization that has both
# least delay and dynamic power. Since such an organization is not possible, CACTI will
# throw an error. Refer CACTI-6 Technical report for more details
-deviate (delay, dynamic power, leakage power, cycle time, area) 20:100000:100000:100000:100000
# Objective for NUCA
-NUCAdesign objective (weight delay, dynamic power, leakage power, cycle time, area) 100:100:0:0:100
-NUCAdeviate (delay, dynamic power, leakage power, cycle time, area) 10:10000:10000:10000:10000
# Set optimize tag to ED or ED^2 to obtain a cache configuration optimized for
# energy-delay or energy-delay sq. product
# Note: Optimize tag will disable weight or deviate values mentioned above
# Set it to NONE to let weight and deviate values determine the
# appropriate cache configuration
//-Optimize ED or ED^2 (ED, ED^2, NONE): "ED"
-Optimize ED or ED^2 (ED, ED^2, NONE): "ED^2"
//-Optimize ED or ED^2 (ED, ED^2, NONE): "NONE"
-Cache model (NUCA, UCA) - "UCA"
//-Cache model (NUCA, UCA) - "NUCA"
# In order for CACTI to find the optimal NUCA bank value the following
# variable should be assigned 0.
-NUCA bank count 0
# NOTE: for NUCA, the network frequency is set to a default value of
# 5 GHz in router.cc. CACTI automatically calculates the maximum
# possible frequency and downgrades this value if necessary
# By default CACTI considers both full-swing and low-swing
# wires to find an optimal configuration. However, it is possible to
# restrict the search space by changing the signaling from "default" to
# "fullswing" or "lowswing" type.
-Wire signaling (fullswing, lowswing, default) - "Global_30"
//-Wire signaling (fullswing, lowswing, default) - "default"
//-Wire signaling (fullswing, lowswing, default) - "lowswing"
//-Wire inside mat - "global"
-Wire inside mat - "semi-global"
//-Wire outside mat - "global"
-Wire outside mat - "semi-global"
-Interconnect projection - "conservative"
//-Interconnect projection - "aggressive"
# Contention in network (which is a function of core count and cache level) is one of
# the critical factors used for deciding the optimal bank count value
# core count can be 4, 8, or 16
//-Core count 4
-Core count 8
//-Core count 16
-Cache level (L2/L3) - "L3"
-Add ECC - "true"
//-Print level (DETAILED, CONCISE) - "CONCISE"
-Print level (DETAILED, CONCISE) - "DETAILED"
# for debugging
//-Print input parameters - "true"
-Print input parameters - "false"
# force CACTI to model the cache with the
# following Ndbl, Ndwl, Nspd, Ndsam,
# and Ndcm values
//-Force cache config - "true"
-Force cache config - "false"
-Ndwl 1
-Ndbl 1
-Nspd 0
-Ndcm 1
-Ndsam1 0
-Ndsam2 0
#### Default CONFIGURATION values for baseline external IO parameters to DRAM.
# Memory Type (D=DDR3, L=LPDDR2, W=WideIO, S=Low-swing differential)
-dram_type "D"
//-dram_type "L"
//-dram_type "W"
//-dram_type "S"
# Memory State (R=Read, W=Write, I=Idle or S=Sleep)
//-iostate "R"
-iostate "W"
//-iostate "I"
//-iostate "S"
# Is ECC Enabled (Y=Yes, N=No)
-dram_ecc "Y"
#Address bus timing
//-addr_timing 0.5 //DDR, for LPDDR2 and LPDDR3
-addr_timing 1.0 //SDR for DDR3, Wide-IO
//-addr_timing 2.0 //2T timing
//-addr_timing 3.0 // 3T timing
# Bandwidth (Gbytes per second, this is the effective bandwidth)
-bus_bw 12.8 GBps //Valid range 0 to 2*bus_freq*num_dq
# Memory Density (Gbit per memory/DRAM die)
-mem_density 4 Gb //Valid values 2^n Gb
# IO frequency (MHz) (frequency of the external memory interface).
-bus_freq 800 MHz //Valid range 0 to 1.5 GHz for DDR3, 0 to 1.2 GHz for LPDDR3, 0 - 800 MHz for WideIO and 0 - 3 GHz for Low-swing differential
# Duty Cycle (fraction of time in the Memory State defined above)
-duty_cycle 1.0 //Valid range 0 to 1.0
# Activity factor for Data (0->1 transitions) per cycle (for DDR, need to account for the higher activity in this parameter. E.g. max. activity factor for DDR is 1.0, for SDR is 0.5)
-activity_dq 1.0 //Valid range 0 to 1.0 for DDR, 0 to 0.5 for SDR
# Activity factor for Control/Address (0->1 transitions) per cycle (for DDR, need to account for the higher activity in this parameter. E.g. max. activity factor for DDR is 1.0, for SDR is 0.5)
-activity_ca 0.5 //Valid range 0 to 1.0 for DDR, 0 to 0.5 for SDR
# Number of DQ pins
-num_dq 72 //Include ECC pins as well (if present). If ECC pins are included, the bus bandwidth is 2*(num_dq-#of ECC pins)*bus_freq. Valid range 0 to 72.
# Number of DQS pins
-num_dqs 18 //2 x differential pairs. Include ECC pins as well. Valid range 0 to 18. For x4 memories, could have 36 DQS pins.
# Number of CA pins
-num_ca 25 //Valid range 0 to 35 pins.
# Number of CLK pins
-num_clk 2 //2 x differential pair. Valid values: 0/2/4.
# Number of Physical Ranks
-num_mem_dq 2 //Number of ranks (loads on DQ and DQS) per DIMM or buffer chip
# Width of the Memory Data Bus
-mem_data_width 8 //x4 or x8 or x16 or x32 or x128 memories

View file

@ -0,0 +1,259 @@
# Cache size
//-size (bytes) 2048
//-size (bytes) 4096
//-size (bytes) 32768
//-size (bytes) 131072
//-size (bytes) 262144
//-size (bytes) 1048576
//-size (bytes) 2097152
//-size (bytes) 4194304
-size (bytes) 8388608
//-size (bytes) 16777216
//-size (bytes) 33554432
//-size (bytes) 134217728
//-size (bytes) 67108864
//-size (bytes) 1073741824
# power gating
-Array Power Gating - "false"
-WL Power Gating - "false"
-CL Power Gating - "false"
-Bitline floating - "false"
-Interconnect Power Gating - "false"
-Power Gating Performance Loss 0.01
# Line size
//-block size (bytes) 8
-block size (bytes) 64
# To model Fully Associative cache, set associativity to zero
//-associativity 0
//-associativity 2
//-associativity 4
//-associativity 8
-associativity 8
-read-write port 1
-exclusive read port 0
-exclusive write port 0
-single ended read ports 0
# Multiple banks connected using a bus
-UCA bank count 1
-technology (u) 0.022
//-technology (u) 0.040
//-technology (u) 0.032
//-technology (u) 0.090
# following three parameters are meaningful only for main memories
-page size (bits) 8192
-burst length 8
-internal prefetch width 8
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
-Data array cell type - "itrs-hp"
//-Data array cell type - "itrs-lstp"
//-Data array cell type - "itrs-lop"
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
-Data array peripheral type - "itrs-hp"
//-Data array peripheral type - "itrs-lstp"
//-Data array peripheral type - "itrs-lop"
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
-Tag array cell type - "itrs-hp"
//-Tag array cell type - "itrs-lstp"
//-Tag array cell type - "itrs-lop"
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
-Tag array peripheral type - "itrs-hp"
//-Tag array peripheral type - "itrs-lstp"
//-Tag array peripheral type - "itrs-lop"
# Bus width include data bits and address bits required by the decoder
//-output/input bus width 16
-output/input bus width 512
// 300-400 in steps of 10
-operating temperature (K) 360
# Type of memory - cache (with a tag array) or ram (scratch ram similar to a register file)
# or main memory (no tag array and every access will happen at a page granularity Ref: CACTI 5.3 report)
-cache type "cache"
//-cache type "ram"
//-cache type "main memory"
# to model special structure like branch target buffers, directory, etc.
# change the tag size parameter
# if you want cacti to calculate the tagbits, set the tag size to "default"
-tag size (b) "default"
//-tag size (b) 22
# fast - data and tag access happen in parallel
# sequential - data array is accessed after accessing the tag array
# normal - data array lookup and tag access happen in parallel
# final data block is broadcasted in data array h-tree
# after getting the signal from the tag array
//-access mode (normal, sequential, fast) - "fast"
-access mode (normal, sequential, fast) - "normal"
//-access mode (normal, sequential, fast) - "sequential"
# DESIGN OBJECTIVE for UCA (or banks in NUCA)
-design objective (weight delay, dynamic power, leakage power, cycle time, area) 0:0:0:100:0
# Percentage deviation from the minimum value
# Ex: A deviation value of 10:1000:1000:1000:1000 will try to find an organization
# that compromises at most 10% delay.
# NOTE: Try reasonable values for % deviation. Inconsistent deviation
# percentage values will not produce any valid organizations. For example,
# 0:0:100:100:100 will try to identify an organization that has both
# least delay and dynamic power. Since such an organization is not possible, CACTI will
# throw an error. Refer CACTI-6 Technical report for more details
-deviate (delay, dynamic power, leakage power, cycle time, area) 20:100000:100000:100000:100000
# Objective for NUCA
-NUCAdesign objective (weight delay, dynamic power, leakage power, cycle time, area) 100:100:0:0:100
-NUCAdeviate (delay, dynamic power, leakage power, cycle time, area) 10:10000:10000:10000:10000
# Set optimize tag to ED or ED^2 to obtain a cache configuration optimized for
# energy-delay or energy-delay sq. product
# Note: Optimize tag will disable weight or deviate values mentioned above
# Set it to NONE to let weight and deviate values determine the
# appropriate cache configuration
//-Optimize ED or ED^2 (ED, ED^2, NONE): "ED"
-Optimize ED or ED^2 (ED, ED^2, NONE): "ED^2"
//-Optimize ED or ED^2 (ED, ED^2, NONE): "NONE"
-Cache model (NUCA, UCA) - "UCA"
//-Cache model (NUCA, UCA) - "NUCA"
# In order for CACTI to find the optimal NUCA bank value the following
# variable should be assigned 0.
-NUCA bank count 0
# NOTE: for NUCA, the network frequency is set to a default value of
# 5 GHz in router.cc. CACTI automatically calculates the maximum
# possible frequency and downgrades this value if necessary
# By default CACTI considers both full-swing and low-swing
# wires to find an optimal configuration. However, it is possible to
# restrict the search space by changing the signaling from "default" to
# "fullswing" or "lowswing" type.
-Wire signaling (fullswing, lowswing, default) - "Global_30"
//-Wire signaling (fullswing, lowswing, default) - "default"
//-Wire signaling (fullswing, lowswing, default) - "lowswing"
//-Wire inside mat - "global"
-Wire inside mat - "semi-global"
//-Wire outside mat - "global"
-Wire outside mat - "semi-global"
-Interconnect projection - "conservative"
//-Interconnect projection - "aggressive"
# Contention in network (which is a function of core count and cache level) is one of
# the critical factors used for deciding the optimal bank count value
# core count can be 4, 8, or 16
//-Core count 4
-Core count 8
//-Core count 16
-Cache level (L2/L3) - "L3"
-Add ECC - "true"
//-Print level (DETAILED, CONCISE) - "CONCISE"
-Print level (DETAILED, CONCISE) - "DETAILED"
# for debugging
//-Print input parameters - "true"
-Print input parameters - "false"
# force CACTI to model the cache with the
# following Ndbl, Ndwl, Nspd, Ndsam,
# and Ndcm values
//-Force cache config - "true"
-Force cache config - "false"
-Ndwl 1
-Ndbl 1
-Nspd 0
-Ndcm 1
-Ndsam1 0
-Ndsam2 0
#### Default CONFIGURATION values for baseline external IO parameters to DRAM.
# Memory Type (D=DDR3, L=LPDDR2, W=WideIO, S=Low-swing differential)
//-dram_type "D"
//-dram_type "L"
//-dram_type "W"
-dram_type "S"
# Memory State (R=Read, W=Write, I=Idle or S=Sleep)
//-iostate "R"
-iostate "W"
//-iostate "I"
//-iostate "S"
# Is ECC Enabled (Y=Yes, N=No)
-dram_ecc "N"
#Address bus timing
//-addr_timing 0.5 //DDR, for LPDDR2 and LPDDR3
-addr_timing 1.0 //SDR for DDR3, Wide-IO
//-addr_timing 2.0 //2T timing
//-addr_timing 3.0 // 3T timing
# Bandwidth (Gbytes per second, this is the effective bandwidth)
-bus_bw 6 GBps //Valid range 0 to 2*bus_freq*num_dq
# Memory Density (Gbit per memory/DRAM die)
-mem_density 4 Gb //Valid values 2^n Gb
# IO frequency (MHz) (frequency of the external memory interface).
-bus_freq 3000 MHz //Valid range 0 to 1.5 GHz for DDR3, 0 to 1.2 GHz for LPDDR3, 0 - 800 MHz for WideIO and 0 - 3 GHz for Low-swing differential
# Duty Cycle (fraction of time in the Memory State defined above)
-duty_cycle 1.0 //Valid range 0 to 1.0
# Activity factor for Data (0->1 transitions) per cycle (for DDR, need to account for the higher activity in this parameter. E.g. max. activity factor for DDR is 1.0, for SDR is 0.5)
-activity_dq 1.0 //Valid range 0 to 1.0 for DDR, 0 to 0.5 for SDR
# Activity factor for Control/Address (0->1 transitions) per cycle (for DDR, need to account for the higher activity in this parameter. E.g. max. activity factor for DDR is 1.0, for SDR is 0.5)
-activity_ca 0.5 //Valid range 0 to 1.0 for DDR, 0 to 0.5 for SDR
# Number of DQ pins
-num_dq 8 //Include ECC pins as well (if present). If ECC pins are included, the bus bandwidth is 2*(num_dq-#of ECC pins)*bus_freq. Valid range 0 to 72.
# Number of DQS pins
-num_dqs 2 //2 x differential pairs. Include ECC pins as well. Valid range 0 to 18. For x4 memories, could have 36 DQS pins.
# Number of CA pins
-num_ca 0 //Valid range 0 to 35 pins.
# Number of CLK pins
-num_clk 0 //2 x differential pair. Valid values: 0/2/4.
# Number of Physical Ranks
-num_mem_dq 2 //Number of ranks (loads on DQ and DQS) per DIMM or buffer chip
# Width of the Memory Data Bus
-mem_data_width 8 //x4 or x8 or x16 or x32 memories

View file

@ -0,0 +1,259 @@
# Cache size
//-size (bytes) 2048
//-size (bytes) 4096
//-size (bytes) 32768
//-size (bytes) 131072
//-size (bytes) 262144
//-size (bytes) 1048576
//-size (bytes) 2097152
//-size (bytes) 4194304
-size (bytes) 8388608
//-size (bytes) 16777216
//-size (bytes) 33554432
//-size (bytes) 134217728
//-size (bytes) 67108864
//-size (bytes) 1073741824
# power gating
-Array Power Gating - "false"
-WL Power Gating - "false"
-CL Power Gating - "false"
-Bitline floating - "false"
-Interconnect Power Gating - "false"
-Power Gating Performance Loss 0.01
# Line size
//-block size (bytes) 8
-block size (bytes) 64
# To model Fully Associative cache, set associativity to zero
//-associativity 0
//-associativity 2
//-associativity 4
//-associativity 8
-associativity 8
-read-write port 1
-exclusive read port 0
-exclusive write port 0
-single ended read ports 0
# Multiple banks connected using a bus
-UCA bank count 1
-technology (u) 0.022
//-technology (u) 0.040
//-technology (u) 0.032
//-technology (u) 0.090
# following three parameters are meaningful only for main memories
-page size (bits) 8192
-burst length 8
-internal prefetch width 8
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
-Data array cell type - "itrs-hp"
//-Data array cell type - "itrs-lstp"
//-Data array cell type - "itrs-lop"
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
-Data array peripheral type - "itrs-hp"
//-Data array peripheral type - "itrs-lstp"
//-Data array peripheral type - "itrs-lop"
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
-Tag array cell type - "itrs-hp"
//-Tag array cell type - "itrs-lstp"
//-Tag array cell type - "itrs-lop"
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
-Tag array peripheral type - "itrs-hp"
//-Tag array peripheral type - "itrs-lstp"
//-Tag array peripheral type - "itrs-lop"
# Bus width includes data bits and address bits required by the decoder
//-output/input bus width 16
-output/input bus width 512
// 300-400 in steps of 10
-operating temperature (K) 360
# Type of memory - cache (with a tag array) or ram (scratch ram similar to a register file)
# or main memory (no tag array and every access will happen at a page granularity Ref: CACTI 5.3 report)
-cache type "cache"
//-cache type "ram"
//-cache type "main memory"
# to model special structures such as branch target buffers, directories, etc.,
# change the tag size parameter
# if you want cacti to calculate the tagbits, set the tag size to "default"
-tag size (b) "default"
//-tag size (b) 22
# fast - data and tag access happen in parallel
# sequential - data array is accessed after accessing the tag array
# normal - data array lookup and tag access happen in parallel
# final data block is broadcast in data array h-tree
# after getting the signal from the tag array
//-access mode (normal, sequential, fast) - "fast"
-access mode (normal, sequential, fast) - "normal"
//-access mode (normal, sequential, fast) - "sequential"
# DESIGN OBJECTIVE for UCA (or banks in NUCA)
-design objective (weight delay, dynamic power, leakage power, cycle time, area) 0:0:0:100:0
# Percentage deviation from the minimum value
# Ex: A deviation value of 10:1000:1000:1000:1000 will try to find an organization
# that compromises at most 10% delay.
# NOTE: Try reasonable values for % deviation. Inconsistent deviation
# percentage values will not produce any valid organizations. For example,
# 0:0:100:100:100 will try to identify an organization that has both
# least delay and dynamic power. Since such an organization is not possible, CACTI will
# throw an error. Refer to the CACTI-6 technical report for more details
-deviate (delay, dynamic power, leakage power, cycle time, area) 20:100000:100000:100000:100000
# Objective for NUCA
-NUCAdesign objective (weight delay, dynamic power, leakage power, cycle time, area) 100:100:0:0:100
-NUCAdeviate (delay, dynamic power, leakage power, cycle time, area) 10:10000:10000:10000:10000
# Set optimize tag to ED or ED^2 to obtain a cache configuration optimized for
# energy-delay or energy-delay sq. product
# Note: Optimize tag will disable weight or deviate values mentioned above
# Set it to NONE to let weight and deviate values determine the
# appropriate cache configuration
//-Optimize ED or ED^2 (ED, ED^2, NONE): "ED"
-Optimize ED or ED^2 (ED, ED^2, NONE): "ED^2"
//-Optimize ED or ED^2 (ED, ED^2, NONE): "NONE"
-Cache model (NUCA, UCA) - "UCA"
//-Cache model (NUCA, UCA) - "NUCA"
# In order for CACTI to find the optimal NUCA bank value the following
# variable should be assigned 0.
-NUCA bank count 0
# NOTE: for NUCA, the network frequency is set to a default value of
# 5GHz in time.c. CACTI automatically
# calculates the maximum possible frequency and downgrades this value if necessary
# By default CACTI considers both full-swing and low-swing
# wires to find an optimal configuration. However, it is possible to
# restrict the search space by changing the signaling from "default" to
# "fullswing" or "lowswing" type.
-Wire signaling (fullswing, lowswing, default) - "Global_30"
//-Wire signaling (fullswing, lowswing, default) - "default"
//-Wire signaling (fullswing, lowswing, default) - "lowswing"
//-Wire inside mat - "global"
-Wire inside mat - "semi-global"
//-Wire outside mat - "global"
-Wire outside mat - "semi-global"
-Interconnect projection - "conservative"
//-Interconnect projection - "aggressive"
# Contention in network (which is a function of core count and cache level) is one of
# the critical factors used for deciding the optimal bank count value
# core count can be 4, 8, or 16
//-Core count 4
-Core count 8
//-Core count 16
-Cache level (L2/L3) - "L3"
-Add ECC - "true"
//-Print level (DETAILED, CONCISE) - "CONCISE"
-Print level (DETAILED, CONCISE) - "DETAILED"
# for debugging
//-Print input parameters - "true"
-Print input parameters - "false"
# force CACTI to model the cache with the
# following Ndbl, Ndwl, Nspd, Ndsam,
# and Ndcm values
//-Force cache config - "true"
-Force cache config - "false"
-Ndwl 1
-Ndbl 1
-Nspd 0
-Ndcm 1
-Ndsam1 0
-Ndsam2 0
#### Default CONFIGURATION values for baseline external IO parameters to DRAM.
# Memory Type (D=DDR3, L=LPDDR2, W=WideIO, S=Low-swing differential)
//-dram_type "D"
-dram_type "L"
//-dram_type "W"
//-dram_type "S"
# Memory State (R=Read, W=Write, I=Idle or S=Sleep)
//-iostate "R"
-iostate "W"
//-iostate "I"
//-iostate "S"
# Is ECC Enabled (Y=Yes, N=No)
-dram_ecc "N"
#Address bus timing
-addr_timing 0.5 //DDR, for LPDDR2 and LPDDR3
//-addr_timing 1.0 //SDR for DDR3, Wide-IO
//-addr_timing 2.0 //2T timing
//-addr_timing 3.0 //3T timing
# Bandwidth (Gbytes per second, this is the effective bandwidth)
-bus_bw 6.4 GBps //Valid range 0 to 2*bus_freq*num_dq
# Memory Density (Gbit per memory/DRAM die)
-mem_density 4 Gb //Valid values 2^n Gb
# IO frequency (MHz) (frequency of the external memory interface).
-bus_freq 800 MHz //Valid range 0 to 1.5 GHz for DDR3, 0 to 1.2 GHz for LPDDR3, 0 - 800 MHz for WideIO and 0 - 3 GHz for Low-swing differential
# Duty Cycle (fraction of time in the Memory State defined above)
-duty_cycle 1.0 //Valid range 0 to 1.0
# Activity factor for Data (0->1 transitions) per cycle (for DDR, need to account for the higher activity in this parameter. E.g. max. activity factor for DDR is 1.0, for SDR is 0.5)
-activity_dq 1.0 //Valid range 0 to 1.0 for DDR, 0 to 0.5 for SDR
# Activity factor for Control/Address (0->1 transitions) per cycle (for DDR, need to account for the higher activity in this parameter. E.g. max. activity factor for DDR is 1.0, for SDR is 0.5)
-activity_ca 0.5 //Valid range 0 to 1.0 for DDR, 0 to 0.5 for SDR
# Number of DQ pins
-num_dq 32 //Include ECC pins as well (if present). If ECC pins are included, the bus bandwidth is 2*(num_dq-#of ECC pins)*bus_freq. Valid range 0 to 72.
# Number of DQS pins
-num_dqs 8 //2 x differential pairs. Include ECC pins as well. Valid range 0 to 18. For x4 memories, could have 36 DQS pins.
# Number of CA pins
-num_ca 14 //Valid range 0 to 35 pins.
# Number of CLK pins
-num_clk 2 //2 x differential pair. Valid values: 0/2/4.
# Number of Physical Ranks
-num_mem_dq 2 //Number of ranks (loads on DQ and DQS) per DIMM or buffer chip
# Width of the Memory Data Bus
-mem_data_width 32 //x4 or x8 or x16 or x32 or x128 memories

View file

@ -0,0 +1,259 @@
# Cache size
//-size (bytes) 2048
//-size (bytes) 4096
//-size (bytes) 32768
//-size (bytes) 131072
//-size (bytes) 262144
//-size (bytes) 1048576
//-size (bytes) 2097152
//-size (bytes) 4194304
-size (bytes) 8388608
//-size (bytes) 16777216
//-size (bytes) 33554432
//-size (bytes) 134217728
//-size (bytes) 67108864
//-size (bytes) 1073741824
# power gating
-Array Power Gating - "false"
-WL Power Gating - "false"
-CL Power Gating - "false"
-Bitline floating - "false"
-Interconnect Power Gating - "false"
-Power Gating Performance Loss 0.01
# Line size
//-block size (bytes) 8
-block size (bytes) 64
# To model Fully Associative cache, set associativity to zero
//-associativity 0
//-associativity 2
//-associativity 4
//-associativity 8
-associativity 8
-read-write port 1
-exclusive read port 0
-exclusive write port 0
-single ended read ports 0
# Multiple banks connected using a bus
-UCA bank count 1
-technology (u) 0.022
//-technology (u) 0.040
//-technology (u) 0.032
//-technology (u) 0.090
# following three parameters are meaningful only for main memories
-page size (bits) 8192
-burst length 8
-internal prefetch width 8
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
-Data array cell type - "itrs-hp"
//-Data array cell type - "itrs-lstp"
//-Data array cell type - "itrs-lop"
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
-Data array peripheral type - "itrs-hp"
//-Data array peripheral type - "itrs-lstp"
//-Data array peripheral type - "itrs-lop"
# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
-Tag array cell type - "itrs-hp"
//-Tag array cell type - "itrs-lstp"
//-Tag array cell type - "itrs-lop"
# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
-Tag array peripheral type - "itrs-hp"
//-Tag array peripheral type - "itrs-lstp"
//-Tag array peripheral type - "itrs-lop"
# Bus width includes data bits and address bits required by the decoder
//-output/input bus width 16
-output/input bus width 512
// 300-400 in steps of 10
-operating temperature (K) 360
# Type of memory - cache (with a tag array) or ram (scratch ram similar to a register file)
# or main memory (no tag array and every access will happen at a page granularity Ref: CACTI 5.3 report)
-cache type "cache"
//-cache type "ram"
//-cache type "main memory"
# to model special structures such as branch target buffers, directories, etc.,
# change the tag size parameter
# if you want cacti to calculate the tagbits, set the tag size to "default"
-tag size (b) "default"
//-tag size (b) 22
# fast - data and tag access happen in parallel
# sequential - data array is accessed after accessing the tag array
# normal - data array lookup and tag access happen in parallel
# final data block is broadcast in data array h-tree
# after getting the signal from the tag array
//-access mode (normal, sequential, fast) - "fast"
-access mode (normal, sequential, fast) - "normal"
//-access mode (normal, sequential, fast) - "sequential"
# DESIGN OBJECTIVE for UCA (or banks in NUCA)
-design objective (weight delay, dynamic power, leakage power, cycle time, area) 0:0:0:100:0
# Percentage deviation from the minimum value
# Ex: A deviation value of 10:1000:1000:1000:1000 will try to find an organization
# that compromises at most 10% delay.
# NOTE: Try reasonable values for % deviation. Inconsistent deviation
# percentage values will not produce any valid organizations. For example,
# 0:0:100:100:100 will try to identify an organization that has both
# least delay and dynamic power. Since such an organization is not possible, CACTI will
# throw an error. Refer to the CACTI-6 technical report for more details
-deviate (delay, dynamic power, leakage power, cycle time, area) 20:100000:100000:100000:100000
# Objective for NUCA
-NUCAdesign objective (weight delay, dynamic power, leakage power, cycle time, area) 100:100:0:0:100
-NUCAdeviate (delay, dynamic power, leakage power, cycle time, area) 10:10000:10000:10000:10000
# Set optimize tag to ED or ED^2 to obtain a cache configuration optimized for
# energy-delay or energy-delay sq. product
# Note: Optimize tag will disable weight or deviate values mentioned above
# Set it to NONE to let weight and deviate values determine the
# appropriate cache configuration
//-Optimize ED or ED^2 (ED, ED^2, NONE): "ED"
-Optimize ED or ED^2 (ED, ED^2, NONE): "ED^2"
//-Optimize ED or ED^2 (ED, ED^2, NONE): "NONE"
-Cache model (NUCA, UCA) - "UCA"
//-Cache model (NUCA, UCA) - "NUCA"
# In order for CACTI to find the optimal NUCA bank value the following
# variable should be assigned 0.
-NUCA bank count 0
# NOTE: for NUCA, the network frequency is set to a default value of
# 5GHz in time.c. CACTI automatically
# calculates the maximum possible frequency and downgrades this value if necessary
# By default CACTI considers both full-swing and low-swing
# wires to find an optimal configuration. However, it is possible to
# restrict the search space by changing the signaling from "default" to
# "fullswing" or "lowswing" type.
-Wire signaling (fullswing, lowswing, default) - "Global_30"
//-Wire signaling (fullswing, lowswing, default) - "default"
//-Wire signaling (fullswing, lowswing, default) - "lowswing"
//-Wire inside mat - "global"
-Wire inside mat - "semi-global"
//-Wire outside mat - "global"
-Wire outside mat - "semi-global"
-Interconnect projection - "conservative"
//-Interconnect projection - "aggressive"
# Contention in network (which is a function of core count and cache level) is one of
# the critical factors used for deciding the optimal bank count value
# core count can be 4, 8, or 16
//-Core count 4
-Core count 8
//-Core count 16
-Cache level (L2/L3) - "L3"
-Add ECC - "true"
//-Print level (DETAILED, CONCISE) - "CONCISE"
-Print level (DETAILED, CONCISE) - "DETAILED"
# for debugging
//-Print input parameters - "true"
-Print input parameters - "false"
# force CACTI to model the cache with the
# following Ndbl, Ndwl, Nspd, Ndsam,
# and Ndcm values
//-Force cache config - "true"
-Force cache config - "false"
-Ndwl 1
-Ndbl 1
-Nspd 0
-Ndcm 1
-Ndsam1 0
-Ndsam2 0
#### Default CONFIGURATION values for baseline external IO parameters to DRAM.
# Memory Type (D=DDR3, L=LPDDR2, W=WideIO, S=Low-swing differential)
//-dram_type "D"
//-dram_type "L"
-dram_type "W"
//-dram_type "S"
# Memory State (R=Read, W=Write, I=Idle or S=Sleep)
//-iostate "R"
-iostate "W"
//-iostate "I"
//-iostate "S"
# Is ECC Enabled (Y=Yes, N=No)
-dram_ecc "N"
#Address bus timing
//-addr_timing 0.5 //DDR, for LPDDR2 and LPDDR3
-addr_timing 1.0 //SDR for DDR3, Wide-IO
//-addr_timing 2.0 //2T timing
//-addr_timing 3.0 //3T timing
# Bandwidth (Gbytes per second, this is the effective bandwidth)
-bus_bw 12.8 GBps //Valid range 0 to 2*bus_freq*num_dq
# Memory Density (Gbit per memory/DRAM die)
-mem_density 4 Gb //Valid values 2^n Gb
# IO frequency (MHz) (frequency of the external memory interface).
-bus_freq 400 MHz //Valid range 0 to 1.5 GHz for DDR3, 0 to 1.2 GHz for LPDDR3, 0 - 800 MHz for WideIO and 0 - 3 GHz for Low-swing differential
# Duty Cycle (fraction of time in the Memory State defined above)
-duty_cycle 1.0 //Valid range 0 to 1.0
# Activity factor for Data (0->1 transitions) per cycle (for DDR, need to account for the higher activity in this parameter. E.g. max. activity factor for DDR is 1.0, for SDR is 0.5)
-activity_dq 1.0 //Valid range 0 to 1.0 for DDR, 0 to 0.5 for SDR
# Activity factor for Control/Address (0->1 transitions) per cycle (for DDR, need to account for the higher activity in this parameter. E.g. max. activity factor for DDR is 1.0, for SDR is 0.5)
-activity_ca 0.5 //Valid range 0 to 1.0 for DDR, 0 to 0.5 for SDR
# Number of DQ pins
-num_dq 128 //Include ECC pins as well (if present). If ECC pins are included, the bus bandwidth is 2*(num_dq-#of ECC pins)*bus_freq. Valid range 0 to 72.
# Number of DQS pins
-num_dqs 16 //2 x differential pairs. Include ECC pins as well. Valid range 0 to 18. For x4 memories, could have 36 DQS pins.
# Number of CA pins
-num_ca 30 //Valid range 0 to 35 pins.
# Number of CLK pins
-num_clk 2 //2 x differential pair. Valid values: 0/2/4.
# Number of Physical Ranks
-num_mem_dq 2 //Number of ranks (loads on DQ and DQS) per DIMM or buffer chip
# Width of the Memory Data Bus
-mem_data_width 128 //x4 or x8 or x16 or x32 or x128 memories

Some files were not shown because too many files have changed in this diff Show more