GainSight SCALE-Sim Backend Python API Documentation

This page documents the Python scripts in gainsight/backend/python-scripts/ that support the SCALE-Sim systolic array simulation backend; the reference below is generated with mkdocstrings. Please refer to the backend wiki for a summary of implementation details and usage instructions.


SCALE-Sim Runner Script

The run.py script is the main entry point of the backend. It parses command-line arguments, walks each layer's SCALE-Sim memory traces, extracts data lifetimes, aggregates per-type statistics, and optionally graphs the results.

get_aggregate(lifetime_csv_file, freq_data, aggregate_csv_file)

Parse lifetimes CSV and frequency data to compute aggregate statistics per data type.

Parameters:

    lifetime_csv_file (str, required): Path to the CSV file with address-level lifetimes.
    freq_data (dict, required): Dictionary with frequency and access statistics from parse_lifetimes.
    aggregate_csv_file (str, required): Path to output CSV for aggregate statistics.

Returns:

    None

Source code in python/run.py (lines 78-126):
def get_aggregate(lifetime_csv_file: str, freq_data: dict, aggregate_csv_file: str) -> None:
    """Parse lifetimes CSV and frequency data to compute aggregate statistics per data type.

    Args:
        lifetime_csv_file (str): Path to the CSV file with address-level lifetimes.
        freq_data (dict): Dictionary with frequency and access statistics from parse_lifetimes.
        aggregate_csv_file (str): Path to output CSV for aggregate statistics.

    Returns:
        None
    """
    df = pd.read_csv(lifetime_csv_file, header=None, names=["data_type", "addr", "lifetime"])
    stats = df.groupby("data_type")["lifetime"].agg(
        mean="mean",
        median="median",
        percentile_90=lambda x: x.quantile(0.9),
        max="max"
    )
    unique_addresses = df.groupby("data_type")["addr"].nunique()
    num_lifetimes = df['data_type'].value_counts()
    aggregate_data = {
        'ifmap': [],
        'filter': [],
        'ofmap': []
    }

    for key in aggregate_data:
        aggregate_data[key].append(stats['mean'][key])
        aggregate_data[key].append(stats['median'][key])
        aggregate_data[key].append(stats['percentile_90'][key])
        aggregate_data[key].append(stats['max'][key])
        aggregate_data[key].append(freq_data[key][0])
        aggregate_data[key].append(freq_data[key][1])
        aggregate_data[key].append(freq_data[key][2])
        aggregate_data[key].append(freq_data[key][3])
        aggregate_data[key].append(unique_addresses[key])
        aggregate_data[key].append(num_lifetimes[key])

    headers = ['subdivision','lifetime avg','lifetime median','lifetime 90th%','lifetime max','read freq','write freq','num reads','num writes','unique addresses','num_lifetimes']

    with open(aggregate_csv_file, 'w') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(headers)
        for key in aggregate_data:
            new_row = [key]
            new_row += aggregate_data[key]
            writer.writerow(new_row)

    return
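
As a usage illustration, here is a minimal sketch that builds a tiny lifetimes CSV by hand and aggregates it. All file names and numbers are invented; freq_data follows the [read_freq, write_freq, num_reads, num_writes] layout that parse_lifetimes produces.

import csv

from run import get_aggregate  # assumes run.py is on PYTHONPATH

# Hand-made (data_type, addr, lifetime) rows; values are hypothetical.
rows = [
    ('ifmap', 0, 12), ('ifmap', 1, 7),
    ('filter', 100, 30), ('filter', 101, 60),
    ('ofmap', 200, 5), ('ofmap', 201, 9),
]
with open('lifetime_data.csv', 'w', newline='') as f:
    csv.writer(f).writerows(rows)

# Per data type: [read_freq, write_freq, num_reads, num_writes] (invented).
freq_data = {
    'ifmap':  [42.0, 10.0, 1200, 300],
    'filter': [35.5,  8.2,  900, 250],
    'ofmap':  [12.1, 40.0,  300, 900],
}
get_aggregate('lifetime_data.csv', freq_data, 'aggregate_data.csv')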

main()

Main entry point for processing SCALE-Sim traces and generating GainSight data.

Parses command-line arguments, processes each layer's memory traces to extract lifetimes, aggregates statistics, and (optionally) generates graphs for each layer.

Returns:

    None

Source code in python/run.py (lines 18-74):
def main() -> None:
    """Main entry point for processing SCALE-Sim traces and generating GainSight data.

    Parses command-line arguments, processes each layer's memory traces to extract lifetimes,
    aggregates statistics, and (optionally) generates graphs for each layer.

    Returns:
        None
    """
    ## set up
    args = parse_args()
    src_dir = args.source

    if (args.output_dir):
        output_dir = args.output_dir
    else:
        output_dir = 'output/'

    project_name = os.path.basename(src_dir)

    ## parse all layers and write to lifetime csv files, then produce graphs
    for root, dirs, files in os.walk(src_dir):
        if (root == src_dir or 'layer' not in root):
            continue

        layer_name = os.path.basename(root)
        output_subdir = output_dir + project_name + '/' + layer_name + '/'
        lifetime_csv_file = output_subdir + layer_name + '_lifetime_data.csv' # default from parse_lifetimes.py
        aggregate_csv_file = output_subdir + layer_name + '_aggregate_data.csv'
        output_graph = output_subdir + layer_name + '_graph.png'

        if os.path.exists(output_graph): # already generated
            if skip_already_done:
                print("skipping")
                continue

        print('=' * 50)
        print('Working on', layer_name)

        if not os.path.exists(output_subdir):
            os.makedirs(output_subdir)

        ## PARSE
        freq_data = parse_lifetimes.parse_lifetimes(root, output_dir) # writes to csv

        ## AGGREGATE
        get_aggregate(lifetime_csv_file, freq_data, aggregate_csv_file) # writes to csv

        ## GRAPH
        if not skip_graph:
            fig = create_graphs.graph(lifetime_csv_file, layer_name, project_name)
            fig.savefig(output_graph)
            plt.close()

        print('# DONE ALL')

    return

parse_args()

Parse command-line arguments for the run.py script.

Returns:

    argparse.Namespace: Parsed arguments with source and output_dir attributes.

Source code in python/run.py (lines 129-143):
def parse_args() -> argparse.Namespace:
    """Parse command-line arguments for the run.py script.

    Returns:
        argparse.Namespace: Parsed arguments with source and output_dir attributes.
    """
    parser = argparse.ArgumentParser(prog='run.py')
    parser.add_argument('-s', '--source', required=True,
                        help="Path to folder with layer subdirs.",
                        type=os.path.abspath)
    parser.add_argument('-o', '--output_dir')

    args = parser.parse_args()

    return args
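
For reference, a hypothetical end-to-end invocation (the trace path below is made up); the equivalent shell command is python run.py -s <trace dir> -o <output dir>.

import subprocess

# Hypothetical paths; -s must point at a folder containing layer* subdirs.
subprocess.run(
    ['python', 'run.py',
     '-s', '/path/to/traces/project1',
     '-o', 'output/'],
    check=True,
)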

SCALE-Sim Parser Script

The parse_lifetimes.py script parses the trace files generated by the SCALE-Sim simulator, extracting data lifetimes, read/write access statistics, and other performance metrics.

parse_lifetimes.py

Module for parsing SCALE-Sim memory access traces to extract data lifetimes and access statistics.

create_access_list(filename, data_type)

Parse a memory trace file to extract access cycles for each address.

Parameters:

    filename (str, required): Path to the trace CSV file.
    data_type (str, required): Type of data buffer (ifmap, filter, ofmap).

Returns:

    tuple[dict, int]: Dictionary mapping addresses to access cycles, and total active cycles.

Source code in python/parse_lifetimes.py (lines 128-168):
def create_access_list(filename: str, data_type: str) -> tuple[dict, int]:
    """Parse a memory trace file to extract access cycles for each address.

    Args:
        filename (str): Path to the trace CSV file.
        data_type (str): Type of data buffer (ifmap, filter, ofmap).

    Returns:
        tuple[dict, int]: Dictionary mapping addresses to access cycles, and total active cycles.
    """
    active_cycles = 0 # for computing r/w frequency later
    accesses = {} # key=address, values=cycle numbers

    with open(filename, 'r') as f:
        reader = csv.reader(f, delimiter=',')
        for row in reader:
            rw_found = False

            cycle = int(float(row[0]))
            for addr in row[1:len(row)]:
                addr_key = int(float(addr)) # gets rid of the unnecessary *.0

                # filter out false addresses (outside the range of the corresponding buffer)
                if (addr_key < 0):
                    continue
                if (data_type == 'ifmap' and (addr_key >= filter_offset or addr_key < ifmap_offset)):
                    continue
                if (data_type == 'filter' and (addr_key >= ofmap_offset or addr_key < filter_offset)):
                    continue
                if (data_type == 'ofmap' and (addr_key < ofmap_offset)):
                    continue

                rw_found = True
                if (addr_key in accesses):
                    accesses[addr_key].append(cycle)
                else:
                    accesses[addr_key] = [cycle]

            if rw_found:
                active_cycles += 1
    return accesses, active_cycles
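
A small sketch of the trace format this expects: each row is a cycle number followed by the addresses touched in that cycle. The offsets (ifmap_offset, filter_offset, ofmap_offset) are module-level constants in parse_lifetimes.py; the expected output below assumes ifmap_offset = 0 and a filter_offset larger than the addresses used.

from parse_lifetimes import create_access_list  # assumes module on PYTHONPATH

# Tiny hand-written read trace: cycle 1 touches addrs 0 and 1,
# cycle 2 touches addr 0, cycle 3 touches addr 2.
with open('ifmap_read_trace.csv', 'w', newline='') as f:
    f.write('1,0,1\n2,0\n3,2\n')

accesses, active_cycles = create_access_list('ifmap_read_trace.csv', 'ifmap')
print(accesses)       # {0: [1, 2], 1: [1], 2: [3]} under the offset assumption
print(active_cycles)  # 3 -- every cycle contained at least one valid access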

export_lifetimes(lifetimes, csvfile)

Export lifetimes dictionary to a CSV file.

Parameters:

    lifetimes (dict, required): Dictionary of (data_type, {address: [lifetimes]}) pairs.
    csvfile (str, required): Path to the output CSV file.

Returns:

    None

Source code in python/parse_lifetimes.py (lines 218-235):
def export_lifetimes(lifetimes: dict, csvfile: str) -> None:
    """Export lifetimes dictionary to a CSV file.

    Args:
        lifetimes (dict): Dictionary of (data_type, {address: [lifetimes]}) pairs.
        csvfile (str): Path to the output CSV file.

    Returns:
        None
    """
    with open(csvfile, 'w') as f:
        writer = csv.writer(f, delimiter=',')
        for key in lifetimes:
            dictionary = lifetimes[key]
            for addr in dictionary:
                row = np.array([key, str(addr)])
                for lifetime in dictionary[addr]:
                    writer.writerow(np.append(row, str(lifetime)))
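
A quick sketch of the resulting format: one CSV row per individual lifetime, as data_type,addr,lifetime. The values here are invented.

from parse_lifetimes import export_lifetimes  # assumes module on PYTHONPATH

lifetimes = {'ifmap': {0: [12, 7]}, 'ofmap': {200: [5]}}
export_lifetimes(lifetimes, 'lifetime_data.csv')
# lifetime_data.csv now contains:
#   ifmap,0,12
#   ifmap,0,7
#   ofmap,200,5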

find_lifetimes(reads, writes)

Compute lifetimes for each address based on read and write access cycles.

Parameters:

    reads (dict, required): Dictionary mapping addresses to read cycles.
    writes (dict, required): Dictionary mapping addresses to write cycles.

Returns:

    dict: Dictionary mapping addresses to lists of lifetimes (in cycles).

Source code in python/parse_lifetimes.py (lines 171-215):
def find_lifetimes(reads: dict, writes: dict) -> dict:
    """Compute lifetimes for each address based on read and write access cycles.

    Args:
        reads (dict): Dictionary mapping addresses to read cycles.
        writes (dict): Dictionary mapping addresses to write cycles.

    Returns:
        dict: Dictionary mapping addresses to lists of lifetimes (in cycles).
    """
    lifetimes = {}
    for addr in writes:
        if (addr not in reads):
            # written but never read; should not be common
            lifetimes[addr] = [-1]
            continue

        lifetimes[addr] = []
        write_list,read_list = writes[addr],reads[addr]
        w,r = 0,0
        last_write,last_read = write_list[w],read_list[r]
        active_lifetime = False

        while (w < len(write_list)):
            if (int(write_list[w]) < int(read_list[r])): # current write is before nearest-future read
                if (active_lifetime):
                    lifetimes[addr].append(last_read - last_write)
                    active_lifetime = False
                last_write = write_list[w]
                w = w+1
            elif (int(write_list[w]) >= int(read_list[r])): # current write is after or equal to nearest-future read
                active_lifetime = True
                last_read = read_list[r]
                # if there are no more reads, no more lifetimes to consider
                if (r == len(read_list) - 1):
                    lifetimes[addr].append(last_read - last_write)
                    break
                else:
                    r = r+1

        # if there are further reads after the last write
        if (r < len(read_list)):
            lifetimes[addr].append(read_list[len(read_list)-1] - last_write)

    return lifetimes
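
To make the pairing rule concrete, here is a worked sketch with invented cycle numbers: each lifetime runs from a write to the last read that precedes the next write, and trailing reads after the final write close the last lifetime.

from parse_lifetimes import find_lifetimes  # assumes module on PYTHONPATH

# Address 0x100: written at cycle 10, read at 12 and 20, rewritten at 25,
# read again at 30. Expected lifetimes: [20 - 10, 30 - 25] = [10, 5].
reads  = {0x100: [12, 20, 30]}
writes = {0x100: [10, 25]}
print(find_lifetimes(reads, writes))  # {256: [10, 5]}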

get_freq(compute_report, layer, read_active_cycles, write_active_cycles)

Compute read/write frequencies using SCALE-Sim COMPUTE_REPORT and active cycles.

Parameters:

    compute_report (str, required): Path to the COMPUTE_REPORT.csv file.
    layer (int, required): Layer number to analyze.
    read_active_cycles (int, required): Number of cycles with read accesses.
    write_active_cycles (int, required): Number of cycles with write accesses.

Returns:

    list: [read_freq, write_freq] as percentages of total cycles.

Source code in python/parse_lifetimes.py (lines 238-263):
def get_freq(compute_report: str, layer: int, read_active_cycles: int, write_active_cycles: int) -> list:
    """Compute read/write frequencies using SCALE-Sim COMPUTE_REPORT and active cycles.

    Args:
        compute_report (str): Path to the COMPUTE_REPORT.csv file.
        layer (int): Layer number to analyze.
        read_active_cycles (int): Number of cycles with read accesses.
        write_active_cycles (int): Number of cycles with write accesses.

    Returns:
        list: [read_freq, write_freq] as percentages of total cycles.
    """
    freq_data = [0.0, 0.0] # placeholder values
    total_cycles = 0

    with open(compute_report, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            if int(row["LayerID"]) == layer:
                total_cycles = int(row[" Total Cycles"])
                break

    freq_data[0] = 100.0 * (read_active_cycles / (total_cycles * 1.0))
    freq_data[1] = 100.0 * (write_active_cycles / (total_cycles * 1.0))

    return freq_data
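
A minimal sketch with a hand-written COMPUTE_REPORT.csv (note the leading space in the column name, which the DictReader lookup above depends on); all numbers are invented.

from parse_lifetimes import get_freq  # assumes module on PYTHONPATH

with open('COMPUTE_REPORT.csv', 'w', newline='') as f:
    f.write('LayerID, Total Cycles\n')
    f.write('0,1000\n')
    f.write('1,2000\n')

# Layer 1 ran for 2000 cycles; 500 of them had reads and 100 had writes.
print(get_freq('COMPUTE_REPORT.csv', 1, 500, 100))  # [25.0, 5.0]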

main()

Main entry point for parsing memory lifetimes from SCALE-Sim traces.

Parses command-line arguments and processes the specified source directory to extract memory lifetimes and access statistics, writing results to output files.

Returns:

    None

Source code in python/parse_lifetimes.py (lines 23-43):
def main() -> None:
    """Main entry point for parsing memory lifetimes from SCALE-Sim traces.

    Parses command-line arguments and processes the specified source directory to extract
    memory lifetimes and access statistics, writing results to output files.

    Returns:
        None
    """
    ## set up
    args = parse_args()
    src_dir = args.source

    if (args.output_dir):
        output_dir = args.output_dir
    else:
        output_dir = 'output/'

    parse_lifetimes(src_dir, output_dir)

    return

parse_args()

Parse command-line arguments for the parse_lifetimes.py script.

Returns:

    argparse.Namespace: Parsed arguments with source and output_dir attributes.

Source code in python/parse_lifetimes.py (lines 266-278):
def parse_args() -> argparse.Namespace:
    """Parse command-line arguments for the parse_lifetimes.py script.

    Returns:
        argparse.Namespace: Parsed arguments with source and output_dir attributes.
    """
    parser = argparse.ArgumentParser(prog='parse_lifetimes.py')
    parser.add_argument('-s', '--source')
    parser.add_argument('-o', '--output_dir')

    args = parser.parse_args()

    return args

parse_lifetimes(src_dir, output_dir)

Parse SCALE-Sim memory access traces to extract data lifetimes and access statistics.

Parameters:

    src_dir (str, required): Path to the directory containing layer trace files.
    output_dir (str, required): Path to the output directory for processed results.

Returns:

    dict: Dictionary with (data_type, [read_freq, write_freq, ...]) pairs for each memory type.

Source code in python/parse_lifetimes.py (lines 45-125):
def parse_lifetimes(src_dir: str, output_dir: str) -> dict:
    """Parse SCALE-Sim memory access traces to extract data lifetimes and access statistics.

    Args:
        src_dir (str): Path to the directory containing layer trace files.
        output_dir (str): Path to the output directory for processed results.

    Returns:
        dict: Dictionary with (data_type, [read_freq, write_freq, ...]) pairs for each memory type.
    """
    compute_report = os.path.dirname(src_dir) + '/COMPUTE_REPORT.csv'
    access_report = os.path.dirname(src_dir) + '/DETAILED_ACCESS_REPORT.csv'
    layer_name = os.path.basename(src_dir)
    project_name = os.path.basename(os.path.dirname(src_dir))
    output_folder = output_dir + project_name + '/' + layer_name + '/'
    output_file = output_folder + layer_name + '_lifetime_data.csv'

    lifetime_lists = {}
    aggregate_data = {}

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    if os.path.exists(output_file): # already generated
        if skip_already_generated:
            print("# !! Found existing file, skipping generation...")
            return
        else:
            print("# !! Overwriting existing file...")

    for root, dirs, files in os.walk(src_dir):
        if (root != src_dir): # ignore any subfolders in this directory, only care about files in trace_list
            continue

        layer_num = int(re.search(r'layer(\d+)', layer_name).group(1))
        reads = {"ifmap": 0, "filter": 0, "ofmap": 0}
        writes = {"ifmap": 0, "filter": 0, "ofmap": 0}

        with open(access_report, 'r') as f:
            reader = csv.DictReader(f)
            for row in reader:
                if int(row["LayerID"]) == layer_num:
                    reads["ifmap"], writes["ifmap"] = int(row[" SRAM IFMAP Reads"]), int(row[" DRAM IFMAP Reads"])
                    reads["filter"], writes["filter"] = int(row[" SRAM Filter Reads"]), int(row[" DRAM Filter Reads"])
                    reads["ofmap"], writes["ofmap"] = int(row[" SRAM OFMAP Writes"]), int(row[" DRAM OFMAP Writes"])
                    break

        for pair in trace_list: # here is where we parse each pair of CSVs
            read_file = os.path.join(root, pair[1])
            write_file = os.path.join(root, pair[2])

            if ('FILTER' in read_file):
                data_type = 'filter'
                print('# Parsing filter traces...')
            elif ('IFMAP' in read_file):
                data_type = 'ifmap'
                print('# Parsing ifmap traces...')
            elif ('OFMAP' in read_file):
                data_type = 'ofmap'
                print('# Parsing ofmap traces...')

            ## parse
            sram_read_accesses,sram_read_cycles = create_access_list(read_file, data_type)
            sram_write_accesses,sram_write_cycles = create_access_list(write_file, data_type)
            print('# Done extracting access lists')

            ## get aggregate data
            freq_data = get_freq(compute_report, layer_num, sram_read_cycles, sram_write_cycles)
            aggregate_data[data_type] = freq_data
            aggregate_data[data_type] += [reads[data_type], writes[data_type]]

            ## calculate lifetimes
            lifetimes = find_lifetimes(sram_read_accesses, sram_write_accesses)
            lifetime_lists[data_type] = lifetimes
            print('# Done finding lifetimes')

    ## format into csv
    export_lifetimes(lifetime_lists, output_file)
    print('# Exported csv to ' + output_file)

    return aggregate_data
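
The path handling above implies a layout like the one sketched below; the directory names are hypothetical.

from parse_lifetimes import parse_lifetimes  # assumes module on PYTHONPATH

# Implied layout (hypothetical names):
#   traces/project1/COMPUTE_REPORT.csv
#   traces/project1/DETAILED_ACCESS_REPORT.csv
#   traces/project1/layer0/<per-buffer SRAM read/write trace CSVs>
freq_data = parse_lifetimes('traces/project1/layer0', 'output/')
# freq_data maps each data type to [read_freq, write_freq, ...],
# in the shape consumed by run.get_aggregate.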

SCALE-Sim Graph Script

The create_graphs.py script generates graphs of the computed metrics for individual workloads, most notably data lifetime distributions.

create_graphs.py

Module for generating data lifetime distribution graphs from processed SCALE-Sim traces.

graph(csv_file, layer, addt_title)

Generate histogram plots of data lifetimes for each memory type in a layer.

Parameters:

    csv_file (str, required): Path to the CSV file containing lifetime data.
    layer (str, required): Name of the layer being analyzed.
    addt_title (str, required): Additional title string for the plot.

Returns:

    matplotlib.figure.Figure: The generated figure object.

Source code in python/create_graphs.py (lines 45-82):
def graph(csv_file: str, layer: str, addt_title: str) -> plt.Figure:
    """Generate histogram plots of data lifetimes for each memory type in a layer.

    Args:
        csv_file (str): Path to the CSV file containing lifetime data.
        layer (str): Name of the layer being analyzed.
        addt_title (str): Additional title string for the plot.

    Returns:
        matplotlib.figure.Figure: The generated figure object.
    """
    plot_types = ['ifmap', 'filter', 'ofmap']  # change if fewer than 3 data buffers are used; default is ifmap, filter, ofmap
    fig, ax = plt.subplots(len(plot_types), 1, figsize=(10, 5 * len(plot_types)))

    ## setup data
    data = {}
    for data_type in plot_types:
        data[data_type] = []
    with open(csv_file) as f:
        reader = csv.reader(f)
        # load each lifetime into corresponding list
        for row in reader:
            data_type, lifetime = row[0], row[2]
            data[data_type].append(int(lifetime))

    ## create histogram
    for i in range(len(plot_types)):
        ax[i].set_yscale('log')
        ax[i].hist(data[plot_types[i]], bins=np.arange(min(data[plot_types[i]]), max(data[plot_types[i]]) + bin_size, bin_size), edgecolor='black')
        ax[i].set_xlabel("Lifetime (cycles)")
        ax[i].set_ylabel("Frequency")
        ax[i].set_title(("Data Lifetime Frequencies, " + layer + ", " + plot_types[i] + " [" + addt_title + "]"))

    # plt.tight_layout()
    fig.subplots_adjust(hspace=0.5)

    print("# Graph produced")
    return fig
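
A minimal sketch that feeds graph() a tiny hand-written lifetime CSV and saves the figure. bin_size is a module-level constant in create_graphs.py, and each data type needs at least two distinct lifetime values so the histogram bins span a range; all values below are invented.

from create_graphs import graph  # assumes module on PYTHONPATH

# (data_type, addr, lifetime) rows; values are hypothetical.
with open('tiny_lifetimes.csv', 'w', newline='') as f:
    f.write('ifmap,0,12\nifmap,1,7\n'
            'filter,100,30\nfilter,101,60\n'
            'ofmap,200,5\nofmap,201,9\n')

fig = graph('tiny_lifetimes.csv', 'layer0', 'demo')
fig.savefig('layer0_graph.png')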

main()

Main entry point for generating graphs from lifetime CSV data.

Parses command-line arguments and generates graphs for the specified layer's data lifetimes.

Returns:

    None

Source code in python/create_graphs.py (lines 17-38):
def main() -> None:
    """Main entry point for generating graphs from lifetime CSV data.

    Parses command-line arguments and generates graphs for the specified layer's data lifetimes.

    Returns:
        None
    """
    ## set up
    args = parse_args()
    csv_file = args.file
    layer = args.layer

    if (args.output):
        output_file = output_dir + args.output  # output_dir is a module-level default in create_graphs.py
    else:
        output_file = output_dir + layer + '_graphs.png'

    graphed_fig = graph(csv_file, layer, '')  # graph() takes an additional-title string; empty when run standalone
    graphed_fig.savefig(output_file)

    return

parse_args()

Parse command-line arguments for the create_graphs.py script.

Returns:

    argparse.Namespace: Parsed arguments with file, output, and layer attributes.

Source code in python/create_graphs.py (lines 89-102):
def parse_args() -> argparse.Namespace:
    """Parse command-line arguments for the create_graphs.py script.

    Returns:
        argparse.Namespace: Parsed arguments with file, output, and layer attributes.
    """
    parser = argparse.ArgumentParser(prog='create_graphs.py')
    parser.add_argument('-f', '--file')
    parser.add_argument('-o', '--output')
    parser.add_argument('-l', '--layer')

    args = parser.parse_args()

    return args