GainSight SCALE-Sim Backend Python API Documentation

This page documents the Python scripts in gainsight/backend/python-scripts/ that support the SCALE-Sim systolic array simulation backend; the reference below is generated with mkdocstrings. Please refer to the backend wiki for a summary of implementation details and usage instructions.


SCALE-Sim Runner Script

The run.py script is the main entry point of the backend. It parses command-line arguments, walks each layer's SCALE-Sim memory traces, extracts data lifetimes, aggregates per-type statistics, and optionally graphs the results.

get_aggregate(lifetime_csv_file, freq_data, aggregate_csv_file)

Parse lifetimes CSV and frequency data to compute aggregate statistics per data type.

Parameters:

    lifetime_csv_file (str, required): Path to the CSV file with address-level lifetimes.
    freq_data (dict, required): Dictionary with frequency and access statistics from parse_lifetimes.
    aggregate_csv_file (str, required): Path to output CSV for aggregate statistics.

Returns:

    None

Source code in python/run.py (lines 78-126):
def get_aggregate(lifetime_csv_file: str, freq_data: dict, aggregate_csv_file: str) -> None:
    """Parse lifetimes CSV and frequency data to compute aggregate statistics per data type.

    Args:
        lifetime_csv_file (str): Path to the CSV file with address-level lifetimes.
        freq_data (dict): Dictionary with frequency and access statistics from parse_lifetimes.
        aggregate_csv_file (str): Path to output CSV for aggregate statistics.

    Returns:
        None
    """
    df = pd.read_csv(lifetime_csv_file, header=None, names=["data_type", "addr", "lifetime"])
    stats = df.groupby("data_type")["lifetime"].agg(
        mean="mean",
        median="median",
        percentile_90=lambda x: x.quantile(0.9),
        max="max"
    )
    unique_addresses = df.groupby("data_type")["addr"].nunique()
    num_lifetimes = df['data_type'].value_counts()
    aggregate_data = {
        'ifmap': [],
        'filter': [],
        'ofmap': []
    }

    for key in aggregate_data:
        aggregate_data[key].append(stats['mean'][key])
        aggregate_data[key].append(stats['median'][key])
        aggregate_data[key].append(stats['percentile_90'][key])
        aggregate_data[key].append(stats['max'][key])
        aggregate_data[key].append(freq_data[key][0])
        aggregate_data[key].append(freq_data[key][1])
        aggregate_data[key].append(freq_data[key][2])
        aggregate_data[key].append(freq_data[key][3])
        aggregate_data[key].append(unique_addresses[key])
        aggregate_data[key].append(num_lifetimes[key])

    headers = ['subdivision','lifetime avg','lifetime median','lifetime 90th%','lifetime max','read freq','write freq','num reads','num writes','unique addresses','num_lifetimes']

    with open(aggregate_csv_file, 'w') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(headers)
        for key in aggregate_data:
            new_row = [key]
            new_row += aggregate_data[key]
            writer.writerow(new_row)

    return
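
As a usage illustration, here is a minimal sketch that builds a tiny lifetimes CSV by hand and aggregates it. All file names and numbers are invented; freq_data follows the [read_freq, write_freq, num_reads, num_writes] layout that parse_lifetimes produces.

import csv

from run import get_aggregate  # assumes run.py is on PYTHONPATH

# Hand-made (data_type, addr, lifetime) rows; values are hypothetical.
rows = [
    ('ifmap', 0, 12), ('ifmap', 1, 7),
    ('filter', 100, 30), ('filter', 101, 60),
    ('ofmap', 200, 5), ('ofmap', 201, 9),
]
with open('lifetime_data.csv', 'w', newline='') as f:
    csv.writer(f).writerows(rows)

# Per data type: [read_freq, write_freq, num_reads, num_writes] (invented).
freq_data = {
    'ifmap':  [42.0, 10.0, 1200, 300],
    'filter': [35.5,  8.2,  900, 250],
    'ofmap':  [12.1, 40.0,  300, 900],
}
get_aggregate('lifetime_data.csv', freq_data, 'aggregate_data.csv')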

main()

Main entry point for processing SCALE-Sim traces and generating GainSight data.

Parses command-line arguments, processes each layer's memory traces to extract lifetimes, aggregates statistics, and (optionally) generates graphs for each layer.

Returns:

    None

Source code in python/run.py (lines 18-74):
def main() -> None:
    """Main entry point for processing SCALE-Sim traces and generating GainSight data.

    Parses command-line arguments, processes each layer's memory traces to extract lifetimes,
    aggregates statistics, and (optionally) generates graphs for each layer.

    Returns:
        None
    """
    ## set up
    args = parse_args()
    src_dir = args.source

    if (args.output_dir):
        output_dir = args.output_dir
    else:
        output_dir = 'output/'

    project_name = os.path.basename(src_dir)

    ## parse all layers and write to lifetime csv files, then produce graphs
    for root, dirs, files in os.walk(src_dir):
        if (root == src_dir or 'layer' not in root):
            continue

        layer_name = os.path.basename(root)
        output_subdir = output_dir + project_name + '/' + layer_name + '/'
        lifetime_csv_file = output_subdir + layer_name + '_lifetime_data.csv' # default from parse_lifetimes.py
        aggregate_csv_file = output_subdir + layer_name + '_aggregate_data.csv'
        output_graph = output_subdir + layer_name + '_graph.png'

        if os.path.exists(output_graph): # already generated
            if skip_already_done:
                print("skipping")
                continue

        print('=' * 50)
        print('Working on', layer_name)

        if not os.path.exists(output_subdir):
            os.makedirs(output_subdir)

        ## PARSE
        freq_data = parse_lifetimes.parse_lifetimes(root, output_dir) # writes to csv

        ## AGGREGATE
        get_aggregate(lifetime_csv_file, freq_data, aggregate_csv_file) # writes to csv

        ## GRAPH
        if not skip_graph:
            fig = create_graphs.graph(lifetime_csv_file, layer_name, project_name)
            fig.savefig(output_graph)
            plt.close()

        print('# DONE ALL')

    return

parse_args()

Parse command-line arguments for the run.py script.

Returns:

    argparse.Namespace: Parsed arguments with source and output_dir attributes.

Source code in python/run.py (lines 129-143):
def parse_args() -> argparse.Namespace:
    """Parse command-line arguments for the run.py script.

    Returns:
        argparse.Namespace: Parsed arguments with source and output_dir attributes.
    """
    parser = argparse.ArgumentParser(prog='run.py')
    parser.add_argument('-s', '--source', required=True,
                        help="Path to folder with layer subdirs.",
                        type=os.path.abspath)
    parser.add_argument('-o', '--output_dir')

    args = parser.parse_args()

    return args
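
For reference, a hypothetical end-to-end invocation (the trace path below is made up); the equivalent shell command is python run.py -s <trace dir> -o <output dir>.

import subprocess

# Hypothetical paths; -s must point at a folder containing layer* subdirs.
subprocess.run(
    ['python', 'run.py',
     '-s', '/path/to/traces/project1',
     '-o', 'output/'],
    check=True,
)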

SCALE-Sim Parser Script

The parse_lifetimes.py script parses the trace files generated by the SCALE-Sim simulator, extracting data lifetimes, read/write access statistics, and other performance metrics.

parse_lifetimes.py

Module for parsing SCALE-Sim memory access traces to extract data lifetimes and access statistics.

create_access_list(filename, data_type)

Parse a memory trace file to extract access cycles for each address.

Parameters:

    filename (str, required): Path to the trace CSV file.
    data_type (str, required): Type of data buffer (ifmap, filter, ofmap).

Returns:

    tuple[dict, int]: Dictionary mapping addresses to access cycles, and total active cycles.

Source code in python/parse_lifetimes.py (lines 128-168):
def create_access_list(filename: str, data_type: str) -> tuple[dict, int]:
    """Parse a memory trace file to extract access cycles for each address.

    Args:
        filename (str): Path to the trace CSV file.
        data_type (str): Type of data buffer (ifmap, filter, ofmap).

    Returns:
        tuple[dict, int]: Dictionary mapping addresses to access cycles, and total active cycles.
    """
    active_cycles = 0 # for computing r/w frequency later
    accesses = {} # key=address, values=cycle numbers

    with open(filename, 'r') as f:
        reader = csv.reader(f, delimiter=',')
        for row in reader:
            rw_found = False

            cycle = int(float(row[0]))
            for addr in row[1:len(row)]:
                addr_key = int(float(addr)) # gets rid of the unnecessary *.0

                # filter out false addresses (outside the range of the corresponding buffer)
                if (addr_key < 0):
                    continue
                if (data_type == 'ifmap' and (addr_key >= filter_offset or addr_key < ifmap_offset)):
                    continue
                if (data_type == 'filter' and (addr_key >= ofmap_offset or addr_key < filter_offset)):
                    continue
                if (data_type == 'ofmap' and (addr_key < ofmap_offset)):
                    continue

                rw_found = True
                if (addr_key in accesses):
                    accesses[addr_key].append(cycle)
                else:
                    accesses[addr_key] = [cycle]

            if rw_found:
                active_cycles += 1
    return accesses, active_cycles
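
A small sketch of the trace format this expects: each row is a cycle number followed by the addresses touched in that cycle. The offsets (ifmap_offset, filter_offset, ofmap_offset) are module-level constants in parse_lifetimes.py; the expected output below assumes ifmap_offset = 0 and a filter_offset larger than the addresses used.

from parse_lifetimes import create_access_list  # assumes module on PYTHONPATH

# Tiny hand-written read trace: cycle 1 touches addrs 0 and 1,
# cycle 2 touches addr 0, cycle 3 touches addr 2.
with open('ifmap_read_trace.csv', 'w', newline='') as f:
    f.write('1,0,1\n2,0\n3,2\n')

accesses, active_cycles = create_access_list('ifmap_read_trace.csv', 'ifmap')
print(accesses)       # {0: [1, 2], 1: [1], 2: [3]} under the offset assumption
print(active_cycles)  # 3 -- every cycle contained at least one valid access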

export_lifetimes(lifetimes, csvfile)

Export lifetimes dictionary to a CSV file.

Parameters:

    lifetimes (dict, required): Dictionary of (data_type, {address: [lifetimes]}) pairs.
    csvfile (str, required): Path to the output CSV file.

Returns:

    None

Source code in python/parse_lifetimes.py (lines 218-235):
def export_lifetimes(lifetimes: dict, csvfile: str) -> None:
    """Export lifetimes dictionary to a CSV file.

    Args:
        lifetimes (dict): Dictionary of (data_type, {address: [lifetimes]}) pairs.
        csvfile (str): Path to the output CSV file.

    Returns:
        None
    """
    with open(csvfile, 'w') as f:
        writer = csv.writer(f, delimiter=',')
        for key in lifetimes:
            dictionary = lifetimes[key]
            for addr in dictionary:
                row = np.array([key, str(addr)])
                for lifetime in dictionary[addr]:
                    writer.writerow(np.append(row, str(lifetime)))
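
A quick sketch of the resulting format: one CSV row per individual lifetime, as data_type,addr,lifetime. The values here are invented.

from parse_lifetimes import export_lifetimes  # assumes module on PYTHONPATH

lifetimes = {'ifmap': {0: [12, 7]}, 'ofmap': {200: [5]}}
export_lifetimes(lifetimes, 'lifetime_data.csv')
# lifetime_data.csv now contains:
#   ifmap,0,12
#   ifmap,0,7
#   ofmap,200,5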

find_lifetimes(reads, writes)

Compute lifetimes for each address based on read and write access cycles.

Parameters:

    reads (dict, required): Dictionary mapping addresses to read cycles.
    writes (dict, required): Dictionary mapping addresses to write cycles.

Returns:

    dict: Dictionary mapping addresses to lists of lifetimes (in cycles).

Source code in python/parse_lifetimes.py (lines 171-215):
def find_lifetimes(reads: dict, writes: dict) -> dict:
    """Compute lifetimes for each address based on read and write access cycles.

    Args:
        reads (dict): Dictionary mapping addresses to read cycles.
        writes (dict): Dictionary mapping addresses to write cycles.

    Returns:
        dict: Dictionary mapping addresses to lists of lifetimes (in cycles).
    """
    lifetimes = {}
    for addr in writes:
        if (addr not in reads):
            # written but never read; should not be common
            lifetimes[addr] = [-1]
            continue

        lifetimes[addr] = []
        write_list,read_list = writes[addr],reads[addr]
        w,r = 0,0
        last_write,last_read = write_list[w],read_list[r]
        active_lifetime = False

        while (w < len(write_list)):
            if (int(write_list[w]) < int(read_list[r])): # current write is before nearest-future read
                if (active_lifetime):
                    lifetimes[addr].append(last_read - last_write)
                    active_lifetime = False
                last_write = write_list[w]
                w = w+1
            elif (int(write_list[w]) >= int(read_list[r])): # current write is after or equal to nearest-future read
                active_lifetime = True
                last_read = read_list[r]
                # if there are no more reads, no more lifetimes to consider
                if (r == len(read_list) - 1):
                    lifetimes[addr].append(last_read - last_write)
                    break
                else:
                    r = r+1

        # if there are further reads after the last write
        if (r < len(read_list)):
            lifetimes[addr].append(read_list[len(read_list)-1] - last_write)

    return lifetimes
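
To make the pairing rule concrete, here is a worked sketch with invented cycle numbers: each lifetime runs from a write to the last read that precedes the next write, and trailing reads after the final write close the last lifetime.

from parse_lifetimes import find_lifetimes  # assumes module on PYTHONPATH

# Address 0x100: written at cycle 10, read at 12 and 20, rewritten at 25,
# read again at 30. Expected lifetimes: [20 - 10, 30 - 25] = [10, 5].
reads  = {0x100: [12, 20, 30]}
writes = {0x100: [10, 25]}
print(find_lifetimes(reads, writes))  # {256: [10, 5]}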

get_freq(compute_report, layer, read_active_cycles, write_active_cycles)

Compute read/write frequencies using SCALE-Sim COMPUTE_REPORT and active cycles.

Parameters:

    compute_report (str, required): Path to the COMPUTE_REPORT.csv file.
    layer (int, required): Layer number to analyze.
    read_active_cycles (int, required): Number of cycles with read accesses.
    write_active_cycles (int, required): Number of cycles with write accesses.

Returns:

    list: [read_freq, write_freq] as percentages of total cycles.

Source code in python/parse_lifetimes.py (lines 238-263):
def get_freq(compute_report: str, layer: int, read_active_cycles: int, write_active_cycles: int) -> list:
    """Compute read/write frequencies using SCALE-Sim COMPUTE_REPORT and active cycles.

    Args:
        compute_report (str): Path to the COMPUTE_REPORT.csv file.
        layer (int): Layer number to analyze.
        read_active_cycles (int): Number of cycles with read accesses.
        write_active_cycles (int): Number of cycles with write accesses.

    Returns:
        list: [read_freq, write_freq] as percentages of total cycles.
    """
    freq_data = [0.0, 0.0] # placeholder values
    total_cycles = 0

    with open(compute_report, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            if int(row["LayerID"]) == layer:
                total_cycles = int(row[" Total Cycles"])
                break

    freq_data[0] = 100.0 * (read_active_cycles / (total_cycles * 1.0))
    freq_data[1] = 100.0 * (write_active_cycles / (total_cycles * 1.0))

    return freq_data
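
A minimal sketch with a hand-written COMPUTE_REPORT.csv (note the leading space in the column name, which the DictReader lookup above depends on); all numbers are invented.

from parse_lifetimes import get_freq  # assumes module on PYTHONPATH

with open('COMPUTE_REPORT.csv', 'w', newline='') as f:
    f.write('LayerID, Total Cycles\n')
    f.write('0,1000\n')
    f.write('1,2000\n')

# Layer 1 ran for 2000 cycles; 500 of them had reads and 100 had writes.
print(get_freq('COMPUTE_REPORT.csv', 1, 500, 100))  # [25.0, 5.0]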

main()

Main entry point for parsing memory lifetimes from SCALE-Sim traces.

Parses command-line arguments and processes the specified source directory to extract memory lifetimes and access statistics, writing results to output files.

Returns:

    None

Source code in python/parse_lifetimes.py (lines 23-43):
def main() -> None:
    """Main entry point for parsing memory lifetimes from SCALE-Sim traces.

    Parses command-line arguments and processes the specified source directory to extract
    memory lifetimes and access statistics, writing results to output files.

    Returns:
        None
    """
    ## set up
    args = parse_args()
    src_dir = args.source

    if (args.output_dir):
        output_dir = args.output_dir
    else:
        output_dir = 'output/'

    parse_lifetimes(src_dir, output_dir)

    return

parse_args()

Parse command-line arguments for the parse_lifetimes.py script.

Returns:

    argparse.Namespace: Parsed arguments with source and output_dir attributes.

Source code in python/parse_lifetimes.py (lines 266-278):
def parse_args() -> argparse.Namespace:
    """Parse command-line arguments for the parse_lifetimes.py script.

    Returns:
        argparse.Namespace: Parsed arguments with source and output_dir attributes.
    """
    parser = argparse.ArgumentParser(prog='parse_lifetimes.py')
    parser.add_argument('-s', '--source')
    parser.add_argument('-o', '--output_dir')

    args = parser.parse_args()

    return args

parse_lifetimes(src_dir, output_dir)

Parse SCALE-Sim memory access traces to extract data lifetimes and access statistics.

Parameters:

    src_dir (str, required): Path to the directory containing layer trace files.
    output_dir (str, required): Path to the output directory for processed results.

Returns:

    dict: Dictionary with (data_type, [read_freq, write_freq, ...]) pairs for each memory type.

Source code in python/parse_lifetimes.py (lines 45-125):
def parse_lifetimes(src_dir: str, output_dir: str) -> dict:
    """Parse SCALE-Sim memory access traces to extract data lifetimes and access statistics.

    Args:
        src_dir (str): Path to the directory containing layer trace files.
        output_dir (str): Path to the output directory for processed results.

    Returns:
        dict: Dictionary with (data_type, [read_freq, write_freq, ...]) pairs for each memory type.
    """
    compute_report = os.path.dirname(src_dir) + '/COMPUTE_REPORT.csv'
    access_report = os.path.dirname(src_dir) + '/DETAILED_ACCESS_REPORT.csv'
    layer_name = os.path.basename(src_dir)
    project_name = os.path.basename(os.path.dirname(src_dir))
    output_folder = output_dir + project_name + '/' + layer_name + '/'
    output_file = output_folder + layer_name + '_lifetime_data.csv'

    lifetime_lists = {}
    aggregate_data = {}

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    if os.path.exists(output_file): # already generated
        if skip_already_generated:
            print("# !! Found existing file, skipping generation...")
            return
        else:
            print("# !! Overwriting existing file...")

    for root, dirs, files in os.walk(src_dir):
        if (root != src_dir): # ignore any subfolders in this directory, only care about files in trace_list
            continue

        layer_num = int(re.search(r'layer(\d+)', layer_name).group(1))
        reads = {"ifmap": 0, "filter": 0, "ofmap": 0}
        writes = {"ifmap": 0, "filter": 0, "ofmap": 0}

        with open(access_report, 'r') as f:
            reader = csv.DictReader(f)
            for row in reader:
                if int(row["LayerID"]) == layer_num:
                    reads["ifmap"], writes["ifmap"] = int(row[" SRAM IFMAP Reads"]), int(row[" DRAM IFMAP Reads"])
                    reads["filter"], writes["filter"] = int(row[" SRAM Filter Reads"]), int(row[" DRAM Filter Reads"])
                    reads["ofmap"], writes["ofmap"] = int(row[" SRAM OFMAP Writes"]), int(row[" DRAM OFMAP Writes"])
                    break

        for pair in trace_list: # here is where we parse each pair of CSVs
            read_file = os.path.join(root, pair[1])
            write_file = os.path.join(root, pair[2])

            if ('FILTER' in read_file):
                data_type = 'filter'
                print('# Parsing filter traces...')
            elif ('IFMAP' in read_file):
                data_type = 'ifmap'
                print('# Parsing ifmap traces...')
            elif ('OFMAP' in read_file):
                data_type = 'ofmap'
                print('# Parsing ofmap traces...')

            ## parse
            sram_read_accesses,sram_read_cycles = create_access_list(read_file, data_type)
            sram_write_accesses,sram_write_cycles = create_access_list(write_file, data_type)
            print('# Done extracting access lists')

            ## get aggregate data
            freq_data = get_freq(compute_report, layer_num, sram_read_cycles, sram_write_cycles)
            aggregate_data[data_type] = freq_data
            aggregate_data[data_type] += [reads[data_type], writes[data_type]]

            ## calculate lifetimes
            lifetimes = find_lifetimes(sram_read_accesses, sram_write_accesses)
            lifetime_lists[data_type] = lifetimes
            print('# Done finding lifetimes')

    ## format into csv
    export_lifetimes(lifetime_lists, output_file)
    print('# Exported csv to ' + output_file)

    return aggregate_data
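
The path handling above implies a layout like the one sketched below; the directory names are hypothetical.

from parse_lifetimes import parse_lifetimes  # assumes module on PYTHONPATH

# Implied layout (hypothetical names):
#   traces/project1/COMPUTE_REPORT.csv
#   traces/project1/DETAILED_ACCESS_REPORT.csv
#   traces/project1/layer0/<per-buffer SRAM read/write trace CSVs>
freq_data = parse_lifetimes('traces/project1/layer0', 'output/')
# freq_data maps each data type to [read_freq, write_freq, ...],
# in the shape consumed by run.get_aggregate.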

SCALE-Sim Graph Script

The create_graphs.py script generates graphs of the computed metrics for individual workloads, most notably data lifetime distributions.

create_graphs.py

Module for generating data lifetime distribution graphs from processed SCALE-Sim traces.

graph(csv_file, layer, addt_title)

Generate histogram plots of data lifetimes for each memory type in a layer.

Parameters:

    csv_file (str, required): Path to the CSV file containing lifetime data.
    layer (str, required): Name of the layer being analyzed.
    addt_title (str, required): Additional title string for the plot.

Returns:

    matplotlib.figure.Figure: The generated figure object.

Source code in python/create_graphs.py (lines 45-82):
def graph(csv_file: str, layer: str, addt_title: str) -> plt.Figure:
    """Generate histogram plots of data lifetimes for each memory type in a layer.

    Args:
        csv_file (str): Path to the CSV file containing lifetime data.
        layer (str): Name of the layer being analyzed.
        addt_title (str): Additional title string for the plot.

    Returns:
        matplotlib.figure.Figure: The generated figure object.
    """
    plot_types = ['ifmap', 'filter', 'ofmap']  # change if fewer than 3 data buffers are used; default is ifmap, filter, ofmap
    fig, ax = plt.subplots(len(plot_types), 1, figsize=(10, 5 * len(plot_types)))

    ## setup data
    data = {}
    for data_type in plot_types:
        data[data_type] = []
    with open(csv_file) as f:
        reader = csv.reader(f)
        # load each lifetime into corresponding list
        for row in reader:
            data_type, lifetime = row[0], row[2]
            data[data_type].append(int(lifetime))

    ## create histogram
    for i in range(len(plot_types)):
        ax[i].set_yscale('log')
        ax[i].hist(data[plot_types[i]], bins=np.arange(min(data[plot_types[i]]), max(data[plot_types[i]]) + bin_size, bin_size), edgecolor='black')
        ax[i].set_xlabel("Lifetime (cycles)")
        ax[i].set_ylabel("Frequency")
        ax[i].set_title(("Data Lifetime Frequencies, " + layer + ", " + plot_types[i] + " [" + addt_title + "]"))

    # plt.tight_layout()
    fig.subplots_adjust(hspace=0.5)

    print("# Graph produced")
    return fig
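
A minimal sketch that feeds graph() a tiny hand-written lifetime CSV and saves the figure. bin_size is a module-level constant in create_graphs.py, and each data type needs at least two distinct lifetime values so the histogram bins span a range; all values below are invented.

from create_graphs import graph  # assumes module on PYTHONPATH

# (data_type, addr, lifetime) rows; values are hypothetical.
with open('tiny_lifetimes.csv', 'w', newline='') as f:
    f.write('ifmap,0,12\nifmap,1,7\n'
            'filter,100,30\nfilter,101,60\n'
            'ofmap,200,5\nofmap,201,9\n')

fig = graph('tiny_lifetimes.csv', 'layer0', 'demo')
fig.savefig('layer0_graph.png')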

main()

Main entry point for generating graphs from lifetime CSV data.

Parses command-line arguments and generates graphs for the specified layer's data lifetimes.

Returns:

    None

Source code in python/create_graphs.py (lines 17-38):
def main() -> None:
    """Main entry point for generating graphs from lifetime CSV data.

    Parses command-line arguments and generates graphs for the specified layer's data lifetimes.

    Returns:
        None
    """
    ## set up
    args = parse_args()
    csv_file = args.file
    layer = args.layer

    if (args.output):
        output_file = output_dir + args.output  # output_dir is a module-level default in create_graphs.py
    else:
        output_file = output_dir + layer + '_graphs.png'

    graphed_fig = graph(csv_file, layer, '')  # graph() takes an additional-title string; empty when run standalone
    graphed_fig.savefig(output_file)

    return

parse_args()

Parse command-line arguments for the create_graphs.py script.

Returns:

    argparse.Namespace: Parsed arguments with file, output, and layer attributes.

Source code in python/create_graphs.py (lines 89-102):
def parse_args() -> argparse.Namespace:
    """Parse command-line arguments for the create_graphs.py script.

    Returns:
        argparse.Namespace: Parsed arguments with file, output, and layer attributes.
    """
    parser = argparse.ArgumentParser(prog='create_graphs.py')
    parser.add_argument('-f', '--file')
    parser.add_argument('-o', '--output')
    parser.add_argument('-l', '--layer')

    args = parser.parse_args()

    return args