#!/usr/bin/env python
import copy
import csv
import json
import math
from collections import OrderedDict
from enum import Enum
from functools import partial

import numpy as np
import tabulate
from past.utils import old_div

from hailo_model_optimization.acceleras.model_optimization_config.mo_config_model import CalibrationConfig
from hailo_model_optimization.acceleras.utils.acceleras_definitions import (
    BiasMode,
    ModelOptimizationCommand,
)
from hailo_sdk_client.allocator import integrated_hw_graph_base_pb2, integrated_hw_graph_pb2
from hailo_sdk_client.allocator.pb_wrapper import PbWrapper
from hailo_sdk_client.sdk_backend.script_parser.commands import SupportedCommands
from hailo_sdk_client.sdk_backend.sdk_backend_exceptions import ProfilingException
from hailo_sdk_common.hailo_nn.exceptions import HailoNNException
from hailo_sdk_common.hailo_nn.hn_definitions import (
    DefuseType,
    LayerType,
    PaddingType,
    ResizeMethod,
    Subclusters16x4Policy,
    UseL2WeightsPolicy,
)
from hailo_sdk_common.logger.logger import default_logger


def should_ignore_fps(hn_layer):
    """Return whether ``hn_layer`` is an NMS layer.

    Judging by the name, such layers are presumably left out of FPS
    calculations by the callers -- confirm against the call sites.
    """
    layer_op = hn_layer.op
    return layer_op == LayerType.nms


def should_ignore_layer(hn_layer):
    """Return whether ``hn_layer`` is a (possibly external) input or output layer."""
    layer_op = hn_layer.op
    boundary_ops = (
        LayerType.input_layer,
        LayerType.output_layer,
        LayerType.external_input_layer,
        LayerType.external_output_layer,
    )
    return layer_op in boundary_ops


def should_ignore_power(hn_layer):
    """Return whether ``hn_layer`` is one of the layer kinds singled out for power handling.

    True for NMS, bilinear resize, softmax and format-conversion layers.
    ``resize_method`` is only read when the layer is a resize layer.
    """
    if hn_layer.op == LayerType.nms:
        return True
    if hn_layer.op == LayerType.resize and hn_layer.resize_method == ResizeMethod.bilinear:
        return True
    if hn_layer.op == LayerType.softmax:
        return True
    return hn_layer.op == LayerType.format_conversion


def human_size_throughput(size_in_bytes, units="sec"):
    """Format a byte throughput as a human-readable string, e.g. ``"1.50 Kilobytes/sec"``.

    Args:
        size_in_bytes: Number of bytes per ``units``. ``None`` or
            ``Estimator.FIELD_NOT_AVAILABLE`` mean that no value is available.
        units: Time unit appended after the slash.

    Returns:
        ``Estimator.FIELD_NOT_AVAILABLE`` when no value is available, otherwise
        the value scaled down by powers of 1024, printed with two decimals and
        the matching unit suffix.

    Raises:
        ProfilingException: If the value is larger than 2**60 bytes.
    """
    if size_in_bytes in [None, Estimator.FIELD_NOT_AVAILABLE]:
        return Estimator.FIELD_NOT_AVAILABLE

    if size_in_bytes > 2**60:
        # 2**50 bytes is one (binary) petabyte, so this is the divisor that
        # matches the unit printed in the message.
        raise ProfilingException(
            f"Input/Output throughput value calculated ({size_in_bytes / (2**50)} Petabytes/{units}) is too big",
        )

    suffixes = ["Bytes", "Kilobytes", "Megabytes", "Gigabytes", "Terabytes", "Petabytes"]
    scaled = size_in_bytes
    for suffix in suffixes[:-1]:
        if scaled >= 1024:
            scaled /= 1024
        else:
            return f"{scaled:.2f} {suffix}/{units}"
    # The largest unit is always formatted, even when the scaled value is
    # 1024 or more (exactly 2**60 bytes, or values just below it that round up
    # in floating point). This guarantees the function never returns None.
    return f"{scaled:.2f} {suffixes[-1]}/{units}"


class Estimator:
    # Large index constant; its users are outside this part of the file.
    MAX_TOPOSORT_INDEX = 9999999999999
    # Placeholder stored in report fields whose value cannot be computed.
    FIELD_NOT_AVAILABLE = "N/A"
    # Raised (as a ProfilingException message) when the mapped graph carries no contexts graph.
    JLF_NOT_SUPPORTED_ERROR_MSG = (
        "Legacy JLFs are no longer supported. Please recompile a HEF file or contact the "
        "customer service for further assistance. "
    )

    class FIELDS:
        """String keys of the per-layer row dictionaries built by the estimator.

        Each constant is the key under which one per-layer value is stored
        (see ``_add_network_information_to_layer``). ``FIELDS_LIST`` gathers
        the keys in a fixed order that differs from the declaration order.
        """

        # Layer identity and geometry.
        NAME = "layer_name"
        TYPE = "layer_type"
        HEIGHT = "input_height"
        WIDTH = "input_width"
        INPUT_CHANNELS = "input_channels"
        OUTPUT_CHANNELS = "output_channels"
        STRIDES_ROW = "strides_row"
        STRIDES_COL = "strides_col"
        KERNEL_HEIGHT = "kernel_height"
        KERNEL_WIDTH = "kernel_width"
        WEIGHTS = "weights"
        MACS = "macs"
        # Hardware resources used by the layer.
        SCS = "scs"
        APUS = "apus"
        INPUT_ALIGNERS = "input_aligners"
        OF_PER_SC = "of_per_sc"
        OF_PER_MAC = "of_per_mac"
        L2_WEIGHTS = "l2_weights"
        L3_OUTPUT_CUTS = "l3_output_cuts"
        L3_WEIGHT_CUTS = "l3_weight_cuts"
        L4_CUTS = "l4_cuts"
        POWER = "power"
        # Graph connectivity: space-separated layer names.
        PREDECESSORS = "predecessors"
        SUCCESSORS = "successors"
        # NOTE: these two values contain a space, unlike the other keys.
        TOTAL_CYCLES = "total cycles"
        THEORETICAL_CYCLES = "theoretical cycles"
        GROUPS = "groups"
        DILATION_HEIGHT = "dilation_height"
        DILATION_WIDTH = "dilation_width"
        CONFIG_16X4 = "16x4_configuration"
        ELEMENTWISE_FEED_REPEAT = "elementwise_feed_repeat"
        EW_ADD_ENABLED = "ew_add_enabled"
        DEFUSE_MODE = "defuse_mode"

        NODE_LAYER_INPUT_BW = "node_layer_input_bw"
        NODE_LAYER_OUTPUT_BW = "node_layer_output_bw"
        L2_WEIGHTS_USAGE_RATIO = "l2_weights_usage_ratio"

        LAYER_DEFUSE_NAME = "defuse_name"
        FPS = "fps"
        LCUS = "lcus"
        NODE_LAYER_LATENCY = "node_layer_latency"
        # Used to be ESTIMATED_TOTAL_CYCLES. Not removed to not break any direct indices
        DEPRECATED = "deprecated"  # ESTIMATED_TOTAL_CYCLES = 'estimated_total_cycles'
        OUTPUT_FEATURES_PER_SC = "output_features_per_sc"
        MAC_COMPUTATION_UTILIZATION = "mac_computation_utilization"
        APU_ACTIVATION_UTILIZATION = "apu_activation_utilization"
        SUBCLUSTER_UTILIZATION = "subcluster_utilization"

        BUFFERS_NUM = "buffers_num"
        BUFFER_SIZE = "buffer_size"
        CONTEXT = "context"
        DID_REACH_FPS = "did_reach_fps"
        IS_4BIT_ENHANCED = "is_4bit_enhanced"
        IS_SMUFFER_IN = "is_smuffer_in"
        IS_SMUFFER_OUT = "is_smuffer_out"

        LATENCY = "latency"
        OPS = "ops"
        L2_DATA_USAGE = "l2_data_usage"

        # Quantization / value-range information (filled by
        # ``_add_layer_optimization_information``).
        INPUT_ACTIVATION_BITS = "input_activation_bits"
        INPUT_RANGE_MIN = "input_range_min"
        INPUT_RANGE_MAX = "input_range_max"
        KERNEL_RANGE_MIN = "kernel_range_min"
        KERNEL_RANGE_MAX = "kernel_range_max"
        OUTPUT_RANGE_MIN = "output_range_min"
        OUTPUT_RANGE_MAX = "output_range_max"
        OUTPUT_ACTIVATION_BITS = "output_activation_bits"
        WEIGHTS_BITS = "weights_bits"
        BIAS_BITS = "bias_bits"

        # Utilization metrics; the same key strings also appear in PER_CONTEXT_FIELDS.
        ACTIVE_MAC_UTIL = "active_mac_util"
        WIDTH_ALIGNMENT_UTIL = "width_align_util"
        FEATURE_ALIGNMENT_UTIL = "feature_align_util"
        BALANCE_FPS_UTIL = "balance_fps_util"
        MAC_LAYERS_UTIL = "mac_layers_util"
        EFFECTIVE_MAC_UTIL = "effective_mac_util"

        # Ordered list of the keys above. The comment on DEPRECATED indicates
        # that positions in this list are relied on, so do not reorder it.
        FIELDS_LIST = [
            NAME,
            TYPE,
            HEIGHT,
            WIDTH,
            OUTPUT_CHANNELS,
            INPUT_CHANNELS,
            KERNEL_HEIGHT,
            KERNEL_WIDTH,
            STRIDES_ROW,
            STRIDES_COL,
            PREDECESSORS,
            SUCCESSORS,
            WEIGHTS,
            MACS,
            POWER,
            FPS,
            TOTAL_CYCLES,
            THEORETICAL_CYCLES,
            SCS,
            APUS,
            INPUT_ALIGNERS,
            OF_PER_SC,
            OF_PER_MAC,
            L3_OUTPUT_CUTS,
            L2_WEIGHTS_USAGE_RATIO,
            GROUPS,
            DILATION_HEIGHT,
            DILATION_WIDTH,
            CONFIG_16X4,
            ELEMENTWISE_FEED_REPEAT,
            EW_ADD_ENABLED,
            DEFUSE_MODE,
            NODE_LAYER_INPUT_BW,
            NODE_LAYER_OUTPUT_BW,
            LAYER_DEFUSE_NAME,
            L3_WEIGHT_CUTS,
            L4_CUTS,
            LCUS,
            NODE_LAYER_LATENCY,
            DEPRECATED,
            OUTPUT_FEATURES_PER_SC,
            MAC_COMPUTATION_UTILIZATION,
            APU_ACTIVATION_UTILIZATION,
            SUBCLUSTER_UTILIZATION,
            BUFFERS_NUM,
            BUFFER_SIZE,
            CONTEXT,
            DID_REACH_FPS,
            LATENCY,
            L2_WEIGHTS,
            OPS,
            L2_DATA_USAGE,
            INPUT_ACTIVATION_BITS,
            INPUT_RANGE_MIN,
            INPUT_RANGE_MAX,
            KERNEL_RANGE_MIN,
            KERNEL_RANGE_MAX,
            OUTPUT_RANGE_MIN,
            OUTPUT_RANGE_MAX,
            OUTPUT_ACTIVATION_BITS,
            WEIGHTS_BITS,
            BIAS_BITS,
            ACTIVE_MAC_UTIL,
            WIDTH_ALIGNMENT_UTIL,
            FEATURE_ALIGNMENT_UTIL,
            BALANCE_FPS_UTIL,
            MAC_LAYERS_UTIL,
            EFFECTIVE_MAC_UTIL,
            IS_4BIT_ENHANCED,
            IS_SMUFFER_IN,
            IS_SMUFFER_OUT,
        ]

    class POWER_FIELDS:
        """String keys of the per-layer power breakdown.

        ``_get_fields_to_aggregate`` registers these keys (summed as floats)
        only when the estimator was created with ``debug=True``.
        """

        MAC_COMPUTATION_POWER = "mac_computation_power"
        MAC_ACTIVATION_POWER = "mac_activation_power"
        TOTAL_WEIGHT_MEM_POWER = "total_weight_mem_power"
        L3_WEIGHT_MEM_POWER = "l3_weight_mem_power"
        L3_INPUT_READ_MEM_POWER = "l3_input_read_mem_power"
        L3_OUTPUT_WRITE_MEM_POWER = "l3_output_write_mem_power"
        L2_CONTEXT_READ_MEM_POWER = "l2_context_read_mem_power"
        L2_CONTEXT_WRITE_MEM_POWER = "l2_context_write_mem_power"
        L2_WEIGHT_MEM_POWER = "l2_weight_mem_power"
        APU_ACTIVATION_POWER = "apu_activation_power"
        INPUT_ALIGNER_SHIFT_POWER = "input_aligner_shift_power"
        LAYER_CONTROLLER_POWER = "layer_controller_power"

        # All keys above, in declaration order.
        FIELDS_LIST = [
            MAC_COMPUTATION_POWER,
            MAC_ACTIVATION_POWER,
            TOTAL_WEIGHT_MEM_POWER,
            L3_WEIGHT_MEM_POWER,
            L3_INPUT_READ_MEM_POWER,
            L3_OUTPUT_WRITE_MEM_POWER,
            L2_CONTEXT_READ_MEM_POWER,
            L2_CONTEXT_WRITE_MEM_POWER,
            L2_WEIGHT_MEM_POWER,
            APU_ACTIVATION_POWER,
            INPUT_ALIGNER_SHIFT_POWER,
            LAYER_CONTROLLER_POWER,
        ]

    class UTILIZATION_FIELDS:
        """String keys naming utilization metrics.

        The code that fills these keys is not in this part of the file.
        """

        # NOTE: this value ends in "_util" while the others end in "_utilization".
        TOTAL_WEIGHT_MEM_UTILIZATION = "total_weight_mem_util"
        L3_WEIGHT_MEM_UTILIZATION = "l3_weight_mem_utilization"
        L2_WEIGHT_MEM_UTILIZATION = "l2_weight_mem_utilization"
        L3_INPUT_READ_MEM_UTILIZATION = "l3_input_read_mem_utilization"
        L3_OUTPUT_WRITE_MEM_UTILIZATION = "l3_output_write_mem_utilization"
        L2_CONTEXT_READ_MEM_UTILIZATION = "l2_context_read_mem_utilization"
        L2_CONTEXT_WRITE_MEM_UTILIZATION = "l2_context_write_mem_utilization"
        INPUT_ALIGNER_SHIFT_UTILIZATION = "input_aligner_shift_utilization"
        LAYER_CONTROLLER_UTILIZATION = "layer_controller_utilization"
        MAC_ACTIVATION_UTILIZATION = "mac_activation_utilization"

        # All keys above, in declaration order.
        FIELDS_LIST = [
            TOTAL_WEIGHT_MEM_UTILIZATION,
            L3_WEIGHT_MEM_UTILIZATION,
            L2_WEIGHT_MEM_UTILIZATION,
            L3_INPUT_READ_MEM_UTILIZATION,
            L3_OUTPUT_WRITE_MEM_UTILIZATION,
            L2_CONTEXT_READ_MEM_UTILIZATION,
            L2_CONTEXT_WRITE_MEM_UTILIZATION,
            INPUT_ALIGNER_SHIFT_UTILIZATION,
            LAYER_CONTROLLER_UTILIZATION,
            MAC_ACTIVATION_UTILIZATION,
        ]

    class METADATA_FIELDS:
        """String keys of model-level (as opposed to per-layer) report entries.

        Presumably these key the network summary produced elsewhere in the
        file -- confirm against the code that consumes ``FIELDS_LIST``.
        """

        MODEL_NAME = "model_name"
        HW_ARCH = "hw_arch"
        # Several values below equal the per-layer FIELDS keys of the same name
        # (e.g. "scs", "lcus", "power", "fps").
        SUBCLUSTERS = "scs"
        L3_WEIGHT_CUTS = "l3_weight_cuts"
        L3_OUTPUT_CUTS = "l3_output_cuts"
        L3_TOTAL_CUTS = "l3_total_cuts"
        LCUS = "lcus"
        LAYERS = "layers"
        WEIGHTS = "weights"
        OPS_PER_IMAGE = "ops_per_image"
        MACS_PER_IMAGE = "macs_per_image"
        TOTAL_4BIT_MACS_PER_FRAME = "total_4bit_macs_per_frame"
        PURE_OPS_PER_IMAGE = "pure_ops_per_image"
        PURE_MACS_PER_IMAGE = "pure_macs_per_image"
        POWER = "power"
        FPS = "fps"
        SUBCLUSTERS_PER_TARGET = "subclusters_per_target"
        L3_CUTS_PER_TARGET = "l3_cuts_per_target"
        LAYERS_PER_TARGET = "layers_per_target"
        L2_CUTS_PER_TARGET_NORMALIZED = "l2_cuts_per_target_normalized"
        L2_CUTS_USED_NORMALIZED = "l2_cuts_used_normalized"
        PROFILING_MODE = "profiling_mode"
        NET_INPUT_THROUGHPUT = "net_input_throughput"
        GROSS_INPUT_THROUGHPUT = "gross_input_throughput"
        NET_OUTPUT_THROUGHPUT = "net_output_throughput"
        GROSS_OUTPUT_THROUGHPUT = "gross_output_throughput"
        LATENCY = "latency"
        INPUT_ALIGNERS = "input_aligners"
        INPUT_ALIGNERS_PER_TARGET = "input_aligners_per_target"
        APUS = "apus"
        APUS_PER_TARGET = "apus_per_target"
        L4_CUTS = "l4_cuts"
        L4_CUTS_PER_TARGET = "l4_cuts_per_target"
        L2_WEIGHTS = "l2_weights"
        L2_DATA_USAGE = "l2_data_usage"
        L2_CUTS_PER_TARGET = "l2_cuts_per_target"
        NUMBER_OF_DEVICES = "number_of_devices"
        NUMBER_OF_CONTEXTS = "number_of_contexts"
        CALIBRATION = "calibration"
        TRANSPOSE = "transpose"
        RESIZE_INPUT = "resize_input"
        NORMALIZATION = "normalization"
        INPUT_CONVERSION = "input_conversion"
        NMS = "nms"
        OPTIMIZATION_LEVEL = "optimization_level"
        COMPRESSION_LEVEL = "compression_level"
        COMPRESSION_RATE = "compression_rate"
        STREAM_FPS = "stream_fps"
        OPTIMIZATION_GOAL_FPS = "optimization_goal_fps"
        CONTEXT_SWITCH_CONFIGS = "context_switch_configs"
        L2_CUT_SIZE = "l2_cut_size"
        L3_CUT_SIZE = "l3_cut_size"
        L4_CUT_SIZE = "l4_cut_size"

        # Ordered list of the keys above; the order differs from the
        # declaration order (e.g. HW_ARCH and LAYERS appear later here).
        FIELDS_LIST = [
            MODEL_NAME,
            SUBCLUSTERS,
            L3_WEIGHT_CUTS,
            L3_OUTPUT_CUTS,
            LCUS,
            WEIGHTS,
            OPS_PER_IMAGE,
            MACS_PER_IMAGE,
            TOTAL_4BIT_MACS_PER_FRAME,
            PURE_OPS_PER_IMAGE,
            PURE_MACS_PER_IMAGE,
            POWER,
            FPS,
            SUBCLUSTERS_PER_TARGET,
            L3_CUTS_PER_TARGET,
            LAYERS_PER_TARGET,
            L2_CUTS_PER_TARGET_NORMALIZED,
            L2_CUTS_USED_NORMALIZED,
            PROFILING_MODE,
            LAYERS,
            L3_TOTAL_CUTS,
            NET_INPUT_THROUGHPUT,
            GROSS_INPUT_THROUGHPUT,
            NET_OUTPUT_THROUGHPUT,
            GROSS_OUTPUT_THROUGHPUT,
            LATENCY,
            INPUT_ALIGNERS,
            INPUT_ALIGNERS_PER_TARGET,
            APUS,
            APUS_PER_TARGET,
            L4_CUTS,
            L4_CUTS_PER_TARGET,
            L2_WEIGHTS,
            L2_DATA_USAGE,
            L2_CUTS_PER_TARGET,
            NUMBER_OF_DEVICES,
            NUMBER_OF_CONTEXTS,
            CALIBRATION,
            TRANSPOSE,
            RESIZE_INPUT,
            NORMALIZATION,
            INPUT_CONVERSION,
            NMS,
            OPTIMIZATION_LEVEL,
            COMPRESSION_LEVEL,
            COMPRESSION_RATE,
            HW_ARCH,
            STREAM_FPS,
            OPTIMIZATION_GOAL_FPS,
            CONTEXT_SWITCH_CONFIGS,
            L2_CUT_SIZE,
            L3_CUT_SIZE,
            L4_CUT_SIZE,
        ]

    class PER_CONTEXT_FIELDS:
        """String keys of the per-context report entries.

        The ``SPACE_n`` constants all hold the same single-space string, so
        they are indistinguishable as dictionary keys; presumably they act as
        visual column separators in tabular output -- TODO confirm.
        """

        CONTEXT_NAME = "context_name"
        CONTEXT_SWITCH_SCS_MAC_UTIL = "csw_scs_mac_util"
        BATCH8_CONTEXT_SWITCH_SCS_MAC_UTIL = "b8_csw_scs_mac_util"
        SPACE_1 = " "
        LCU_UTIL = "lcu_util"
        L3_UTIL = "l3_util"
        MAPPING_UTIL = "mapping_util"
        ACTIVE_MAC_UTIL = "active_mac_util"
        WIDTH_ALIGNMENT_UTIL = "width_align_util"
        FEATURE_ALIGNMENT_UTIL = "feature_align_util"
        BALANCE_FPS_UTIL = "balance_fps_util"
        MAC_LAYERS_UTIL = "mac_layers_util"
        EFFECTIVE_MAC_UTIL = "effective_mac_util"
        SCS_MAC_UTIL = "scs_mac_util"
        SPACE_2 = " "
        FPS = "fps"
        INVERSE_FPS = "1/fps (s)"
        DRAIN = "drain (s)"
        LATENCY_UTIL = "latency_util"
        BATCH8_LATENCY_UTIL = "b8_latency_util"
        SPACE_3 = " "
        CONTEXT_OVERHEAD = "context_overhead"
        OVERHEAD_UTIL = "overhead_util"
        BATCH8_OVERHEAD_UTIL = "b8_overhead_util"
        TOTAL_MAC_UTIL = "total_mac_util"
        BATCH8_TOTAL_MAC_UTIL = "b8_total_mac_util"
        SPACE_4 = " "
        BOUNDARY_IN = "boundary_in_bytes"
        BOUNDARY_OUT = "boundary_out_bytes"
        INTER_CONTEXT_IN = "inter_context_in_bytes"
        INTER_CONTEXT_OUT = "inter_context_out_bytes"
        DDR_PORTALS_IN = "ddr_portals_in_bytes"
        DDR_PORTALS_OUT = "ddr_portals_out_bytes"

        # Ordered list of the keys above.
        # NOTE(review): SPACE_4 is declared but not listed between
        # BATCH8_TOTAL_MAC_UTIL and BOUNDARY_IN -- confirm whether that is intended.
        FIELDS_LIST = [
            CONTEXT_NAME,
            CONTEXT_SWITCH_SCS_MAC_UTIL,
            BATCH8_CONTEXT_SWITCH_SCS_MAC_UTIL,
            SPACE_1,
            LCU_UTIL,
            L3_UTIL,
            MAPPING_UTIL,
            ACTIVE_MAC_UTIL,
            WIDTH_ALIGNMENT_UTIL,
            FEATURE_ALIGNMENT_UTIL,
            BALANCE_FPS_UTIL,
            MAC_LAYERS_UTIL,
            EFFECTIVE_MAC_UTIL,
            SCS_MAC_UTIL,
            SPACE_2,
            FPS,
            INVERSE_FPS,
            DRAIN,
            LATENCY_UTIL,
            BATCH8_LATENCY_UTIL,
            SPACE_3,
            CONTEXT_OVERHEAD,
            OVERHEAD_UTIL,
            BATCH8_OVERHEAD_UTIL,
            TOTAL_MAC_UTIL,
            BATCH8_TOTAL_MAC_UTIL,
            BOUNDARY_IN,
            BOUNDARY_OUT,
            INTER_CONTEXT_IN,
            INTER_CONTEXT_OUT,
            DDR_PORTALS_IN,
            DDR_PORTALS_OUT,
        ]

    class PERFORMANCE_DETAILS_FIELDS:
        """String keys of performance-detail entries.

        Unlike the other key namespaces of this class, no ``FIELDS_LIST`` is
        defined here. The code that fills these keys is not in this part of
        the file.
        """

        BATCH_SIZE = "batch_size"
        FPS = "fps"
        LATENCY = "latency"
        POWER = "power"
        NUMBER_OF_CONTEXTS = "number_of_contexts"
        INPUT_BW = "input_bw"
        OUTPUT_BW = "output_bw"
        OPS_PER_SECOND = "ops_per_second"
        MEASURED_MAC_UTIL = "measured_mac_util"
        STREAM_FPS = "stream_fps"

    class ACCURACY_FIELDS:
        """String keys used for accuracy / bit-width related entries.

        The code that reads or writes these keys is not in this part of the file.
        """

        NUMBER_OF_BITS = "number_of_bits"
        INPUT_BITS = "input"
        WEIGHT_BITS = "weights"
        OUTPUT_BITS = "output"

    def _get_runtime_fields_class(self, hw_arch):
        class RUNTIME_FIELDS:
            LCU_DONE = "lcu_interrupt"
            MODULE_CONFIG_DONE = "module_config_done_interrupt"
            SEQUENCER_START = "trigger_sequencer"
            SEQUENCER_DONE = "sequencer_done_interrupt"
            ACTIVATE_INPUT = ["activate_inter_context_input", "activate_boundary_input"]
            NETWORK_GROUPS = "network_groups"
            CONTEXTS = "contexts"
            CONTEXT_NAME = "context_name"
            PRELIMINARY = "preliminary"
            ACTIONS = "actions"
            TYPE = "type"
            START = "start"
            END = "end"
            TIMESTAMP = "timestamp"
            DATA = "data"
            DESCRIPTORS_COUNT = "descriptors_count"
            MODULE_INDEX = "module_index"
            CLUSTER_INDEX = "cluster_index"
            SEQUENCER_INDEX = "sequencer_index"
            OVERHEAD = "overhead"
            BW = "bandwidth"
            LAYERS = "layers"
            CONFIGURATIONS = "configurations"
            SEQUENCER = "sequencer"
            TOTAL_CONFIGS = "total_configs"
            CONFIGS_BW = "configs_bw"
            DDR_BW = "ddr_bw"
            INTER_CONTEXT_BW = "inter_context_bw"
            BOUNDARIES_BW = "boundaries"
            ACTUAL_START_TIME = "actual_start_time"
            LABEL = "label"
            CLOCK = "clock_MHz"
            CONTEXT_TIME = "context_time"
            CCW_BURSTS = "ccw_bursts"
            CONFIG_STREAM_INDEX = "config_stream_index"
            BATCH_SIZE = "batch_size"
            RUNTIME_PERCENTAGES = "runtime_percentages"
            RUNS = "runs"

        if hw_arch.is_mercury_arch:
            RUNTIME_FIELDS.FETCH_DESCRIPTORS = "fetch_ccw_bursts"
        else:
            RUNTIME_FIELDS.FETCH_DESCRIPTORS = "fetch_cfg_channel_descriptors"
        return RUNTIME_FIELDS

    @staticmethod
    def _get_context_by_layer_index(integrated_hw_graph):
        result = {}
        if not integrated_hw_graph.HasField("contexts_graph"):
            # Legacy JLFs without context information support
            raise ProfilingException(Estimator.JLF_NOT_SUPPORTED_ERROR_MSG)
        for context_node in integrated_hw_graph.contexts_graph.nodes:
            for resources in context_node.network_graph.resources_list:
                result[resources.layer_index] = context_node.context.context_name
        return result

    @staticmethod
    def _get_context_by_node(integrated_hw_graph):
        result = {}
        if not integrated_hw_graph.HasField("contexts_graph"):
            # Legacy JLFs without context information support
            raise ProfilingException(Estimator.JLF_NOT_SUPPORTED_ERROR_MSG)
        contexts_graph = integrated_hw_graph.contexts_graph
        for context_node in contexts_graph.nodes:
            context_name = context_node.context.context_name
            for node in context_node.network_graph.nodes:
                result[node.name] = context_name
        return result

    @staticmethod
    def _get_contexts_count(integrated_hw_graph):
        if not integrated_hw_graph.HasField("contexts_graph"):
            return 1

        return len(integrated_hw_graph.contexts_graph.nodes)

    def __init__(
        self,
        hw_arch,
        mapped_graph_data,
        hn,
        clk_freq,
        profiling_mode,
        original_hailo_nn=None,
        should_use_logical_layers=True,
        translated_params=None,
        debug=False,
        runtime_data=None,
        hef_proto=None,
        script_parser=None,
        flavor_config=None,
        mo_flavor=None,
        params=None,
        stream_fps=None,
        accuracy_data=None,
    ):
        """Parse the mapped graph and precompute the state used for profiling.

        Args:
            hw_arch: Hardware architecture object; its ``consts`` mapping and
                ``is_mercury_arch`` flag are read here.
            mapped_graph_data: Serialized ``ProtoIntegratedHWGraph``. Required.
            hn: Model object; ``from_integrated_pb_data`` is called on it to
                build the model used by the estimator, and its ``name`` is copied.
            clk_freq: Clock frequency used for time and ops-per-second
                computations (units are not visible here -- TODO confirm).
            profiling_mode: Stored as-is.
            original_hailo_nn: Optional model used for the topological sort
                when ``should_use_logical_layers`` is true.
            should_use_logical_layers: See ``original_hailo_nn``.
            translated_params: Optional mapping, copied into a dict.
            debug: When true, power-breakdown fields are aggregated as well.
            runtime_data: Optional runtime data; kept only for multi-context
                compilations.
            hef_proto: Optional HEF proto; without it the throughput attributes
                are set to ``None``.
            script_parser: Optional parser from which optimization and
                modification commands (and the optimization goal FPS) are exported.
            flavor_config: Optional mapping; defaults to an empty dict.
            mo_flavor: Stored as-is.
            params: Optional mapping, copied into a dict.
            stream_fps: Optional FPS to use instead of the bottleneck FPS; only
                honoured for single-context models without runtime data and
                when not above the bottleneck FPS.
            accuracy_data: Stored as-is.

        Raises:
            ProfilingException: If ``mapped_graph_data`` is missing, or (via the
                context helpers) if the graph carries no contexts graph.
        """
        if not mapped_graph_data:
            raise ProfilingException("Mapped graph data is missing")
        self._logger = default_logger()
        self._hef_proto = hef_proto
        integrated_hw_graph = integrated_hw_graph_pb2.ProtoIntegratedHWGraph()
        integrated_hw_graph.ParseFromString(mapped_graph_data)
        self._graph = integrated_hw_graph.network_graph
        self._context_by_node = self._get_context_by_node(integrated_hw_graph)
        self._context_by_layer_id = self._get_context_by_layer_index(integrated_hw_graph)
        self._number_of_contexts = self._get_contexts_count(integrated_hw_graph)
        self._is_multi_context = self._number_of_contexts > 1
        # Runtime data is only used for multi-context compilations.
        if self._is_multi_context:
            self._runtime_data = runtime_data
        else:
            if runtime_data is not None:
                self._logger.warning(
                    "Runtime data is available, but ignored since the model was compiled to a single context compilation.",
                )
            self._runtime_data = None
        self._clk_freq = clk_freq
        self._nnm_clk_freq = None
        self._hw_arch = hw_arch
        self.RUNTIME_FIELDS = self._get_runtime_fields_class(self._hw_arch)
        # Rebuild the model from the mapped graph, keeping the caller's model name.
        self._hn = hn.from_integrated_pb_data(mapped_graph_data, PbWrapper())
        self._hn.name = hn.name
        self._layers_by_index = self._hn.layers_by_index
        self._min_fps = integrated_hw_graph.bottleneck_fps
        # stream_fps is accepted only for single-context models without runtime
        # data and only when it does not exceed the bottleneck FPS; otherwise
        # it is ignored with a warning and _stream_fps stays None.
        self._stream_fps = None
        if stream_fps is not None:
            if self._is_multi_context:
                self._logger.warning("stream_fps argument is not supported for multi-context cases and will be ignored")
            elif self._runtime_data is not None:
                self._logger.warning("stream_fps argument is not supported when using runtime data and will be ignored")
            elif stream_fps > self._min_fps:
                self._logger.warning(
                    f"stream_fps value {stream_fps:.2f} is higher than the bottleneck FPS "
                    f"{self._min_fps:.2f} and will be ignored. Instead, the bottleneck FPS will be "
                    f"used for power and bandwidth calculation",
                )
            else:
                self._stream_fps = stream_fps
        self._fps_multi_context = None
        self._latency_multi_context = None
        self._mac_util = None
        # clusters * sub-clusters per cluster * 64 * clock frequency * 2
        self._device_ops_per_second = (
            self._hw_arch.consts["CORE_PKG::ACTUAL_CLUSTERS"]
            * self._hw_arch.consts["CLUSTER_UNITS::SUBCLUSTERS"]
            * 64  # macs per sub-cluster
            * self._clk_freq
            * 2  # macs, not ops
        )

        self._profiler_data = integrated_hw_graph.profiler_data
        if self._profiler_data.ByteSize() == 0:
            self._logger.warning(
                "There is no profiler data, make sure to use hef that was created by sdk_version >= 3.14.0",
            )
        self._original_hailo_nn = original_hailo_nn
        self._profiling_mode = profiling_mode
        self._context_min_fps = self._calculate_min_fps_per_context(integrated_hw_graph)
        self._total_configs = None
        self._rows = []
        self._should_use_logical_layers = should_use_logical_layers
        # Copy the mapping so later changes do not touch the caller's object.
        self._translated_params = dict(translated_params) if translated_params else {}
        self._accuracy_data = accuracy_data
        self._debug = debug
        self._is_multi_scope = len(self._hn.net_params.net_scopes) > 1

        # Create topological sort of layers by name
        if self._original_hailo_nn and self._should_use_logical_layers:
            hailo_nn_to_toposort = self._original_hailo_nn
        else:
            hailo_nn_to_toposort = self._hn
        self._layers_topological_sort = [
            self._get_layer_name(layer) for layer in hailo_nn_to_toposort.stable_toposort()
        ]

        # Reads self._debug, so it must run after that attribute is set.
        self._fields_to_aggregate = self._get_fields_to_aggregate()
        self._processed_runtime_data = {}
        self._performance_from_runtime_data = []
        self._params = dict(params) if params else {}
        self._optimization_commands = {}
        self._modification_commands = {}
        self._optimization_goal_fps = None
        if script_parser:
            self._optimization_commands = script_parser.export_model_optimization_commands()
            # Index the modification commands by the value of their function name.
            self._modification_commands = {
                command.function_name.value: command for command in script_parser.export_model_modifications_commands()
            }
            # The optimization goal FPS is only read for single-context models.
            if not self._is_multi_context:
                self._optimization_goal_fps = script_parser.export_optimization_goal_fps()
        self._mo_flavor = mo_flavor
        self._flavor_config = flavor_config if flavor_config else {}
        self._context_bw = {}
        self._context_ignored_output_blocks = {}

        self._calculate_total()
        self._process_runtime_data()
        # Throughput values as returned by _total_throughput; they are scaled by
        # the FPS below.
        if self._hef_proto:
            self.net_input_throughput, self.gross_input_throughput = self._total_throughput(input_throughput=True)
            self.net_output_throughput, self.gross_output_throughput = self._total_throughput()

        # No throughput is reported without a HEF proto, or for multi-context
        # models without runtime data.
        if not self._hef_proto or (self._is_multi_context and self._runtime_data is None):
            self.net_input_throughput = None
            self.gross_input_throughput = None
            self.net_output_throughput = None
            self.gross_output_throughput = None
        else:
            if self._is_multi_context:
                fps_to_use = self._fps_multi_context
            else:
                fps_to_use = self._stream_fps if self._stream_fps is not None else self._min_fps
            if fps_to_use == self.FIELD_NOT_AVAILABLE:
                self.net_input_throughput = self.FIELD_NOT_AVAILABLE
                self.gross_input_throughput = self.FIELD_NOT_AVAILABLE
                self.net_output_throughput = self.FIELD_NOT_AVAILABLE
                self.gross_output_throughput = self.FIELD_NOT_AVAILABLE
            else:
                # Multiply each throughput value by the chosen FPS.
                self.net_input_throughput *= fps_to_use
                self.gross_input_throughput *= fps_to_use
                self.net_output_throughput *= fps_to_use
                self.gross_output_throughput *= fps_to_use

        self._parse_resources()

    def _normalize_time_clk_period(self, end_time, start_time=0):
        return ((end_time - start_time) / self._clk_freq) * 1000

    def _get_nnm_clk_freq(self):
        return self._nnm_clk_freq * 10**6

    def get_latency_data(self):
        result = {}
        for context in self._profiler_data.context:
            context_data = {}
            for layer in context.layers:
                time_range = [(time.start_time / self._clk_freq, time.end_time / self._clk_freq) for time in layer.time]
                context_data[layer.layer_name] = time_range
            result[context.context_name] = context_data
        result["runtime_data"] = self._processed_runtime_data
        return result

    def _get_fields_to_aggregate(self):
        min_float = partial(self._combine_field, min, float)
        max_float = partial(self._combine_field, max, float)
        max_int = partial(self._combine_field, max, int)
        sum_int = partial(self._combine_field, sum, int)
        sum_float = partial(self._combine_field, sum, float)
        first_of = partial(self._combine_field, lambda vals: vals[0], lambda val: val)
        bool_and = partial(self._combine_field, all, lambda x: str(x) == "True")
        fields = {
            self.FIELDS.FPS: min_float,
            self.FIELDS.TOTAL_CYCLES: max_int,
            self.FIELDS.THEORETICAL_CYCLES: max_int,
            self.FIELDS.SCS: sum_float,
            self.FIELDS.APUS: sum_float,
            self.FIELDS.INPUT_ALIGNERS: sum_float,
            self.FIELDS.L2_WEIGHTS: sum_int,
            self.FIELDS.L3_OUTPUT_CUTS: sum_float,
            self.FIELDS.L3_WEIGHT_CUTS: sum_float,
            self.FIELDS.L4_CUTS: sum_float,
            self.FIELDS.LCUS: sum_float,
            self.FIELDS.POWER: sum_float,
            self.FIELDS.NODE_LAYER_LATENCY: max_float,
            self.FIELDS.DEPRECATED: max_int,
            self.FIELDS.CONTEXT: first_of,
            self.FIELDS.DID_REACH_FPS: bool_and,
            self.FIELDS.IS_4BIT_ENHANCED: bool_and,
            self.FIELDS.IS_SMUFFER_OUT: bool_and,
            self.FIELDS.IS_SMUFFER_IN: bool_and,
            self.FIELDS.LATENCY: max_float,
            self.FIELDS.MACS: sum_int,
            self.FIELDS.OF_PER_SC: max_int,
            self.FIELDS.OF_PER_MAC: max_int,
            self.FIELDS.NODE_LAYER_INPUT_BW: sum_float,
            self.FIELDS.NODE_LAYER_OUTPUT_BW: sum_float,
            self.FIELDS.BUFFERS_NUM: sum_float,
            self.FIELDS.BUFFER_SIZE: sum_int,
            self.FIELDS.OUTPUT_FEATURES_PER_SC: max_int,
            self.FIELDS.OPS: sum_int,
            self.FIELDS.L2_DATA_USAGE: sum_int,
        }

        if self._debug:
            fields.update(
                {
                    self.POWER_FIELDS.MAC_COMPUTATION_POWER: sum_float,
                    self.POWER_FIELDS.MAC_ACTIVATION_POWER: sum_float,
                    self.POWER_FIELDS.TOTAL_WEIGHT_MEM_POWER: sum_float,
                    self.POWER_FIELDS.L3_WEIGHT_MEM_POWER: sum_float,
                    self.POWER_FIELDS.L3_INPUT_READ_MEM_POWER: sum_float,
                    self.POWER_FIELDS.L3_OUTPUT_WRITE_MEM_POWER: sum_float,
                    self.POWER_FIELDS.L2_CONTEXT_READ_MEM_POWER: sum_float,
                    self.POWER_FIELDS.L2_CONTEXT_WRITE_MEM_POWER: sum_float,
                    self.POWER_FIELDS.L2_WEIGHT_MEM_POWER: sum_float,
                    self.POWER_FIELDS.APU_ACTIVATION_POWER: sum_float,
                    self.POWER_FIELDS.INPUT_ALIGNER_SHIFT_POWER: sum_float,
                    self.POWER_FIELDS.LAYER_CONTROLLER_POWER: sum_float,
                },
            )

        return fields

    @staticmethod
    def _set_hn_layer_attr(row, field_name, hn_layer, attribute):
        if hasattr(hn_layer, attribute):
            attr = getattr(hn_layer, attribute)
            if isinstance(attr, Enum):
                row[field_name] = attr.value
            else:
                row[field_name] = attr

    def _add_network_information_to_layer(self, hn_layer, hailo_nn, row=None, sub_layer=None):
        row = {} if row is None else row
        FIELDS = self.FIELDS

        if hn_layer:
            row[FIELDS.PREDECESSORS] = " ".join(
                [self._get_layer_name(layer) for layer in hailo_nn.predecessors(hn_layer)]
            )
            row[FIELDS.SUCCESSORS] = " ".join([self._get_layer_name(layer) for layer in hailo_nn.successors(hn_layer)])
            self._add_layer_optimization_information(row, hn_layer)

        if not sub_layer:
            if hn_layer:
                sub_layer = hn_layer
            else:
                return row

        if (
            sub_layer.op in [LayerType.concat, LayerType.output_mux, LayerType.proposal_generator, LayerType.matmul]
            or (sub_layer.op == LayerType.conv and sub_layer.padding is PaddingType.valid)
            or (sub_layer.op == LayerType.merged_layer and sub_layer.is_concat_first)
        ):
            self._unequal_output_shape_dims_layer_attr(row, sub_layer)
        else:
            self._set_hn_layer_attr(row, FIELDS.INPUT_CHANNELS, sub_layer, "input_features")
            self._set_hn_layer_attr(row, FIELDS.WIDTH, sub_layer, "input_width")
            self._set_hn_layer_attr(row, FIELDS.HEIGHT, sub_layer, "input_height")

        self._set_hn_layer_attr(row, FIELDS.OUTPUT_CHANNELS, sub_layer, "output_features")
        self._set_hn_layer_attr(row, FIELDS.STRIDES_ROW, sub_layer, "stride_height")
        self._set_hn_layer_attr(row, FIELDS.STRIDES_COL, sub_layer, "stride_width")
        self._set_hn_layer_attr(row, FIELDS.KERNEL_HEIGHT, sub_layer, "kernel_height")
        self._set_hn_layer_attr(row, FIELDS.KERNEL_WIDTH, sub_layer, "kernel_width")
        self._set_hn_layer_attr(row, FIELDS.GROUPS, sub_layer, "groups")
        self._set_hn_layer_attr(row, FIELDS.EW_ADD_ENABLED, sub_layer, "ew_add_enabled")
        self._get_elementwise_feed_repeat(row, sub_layer)
        self._get_defuse_type(row, sub_layer)
        self._get_dilations(row, sub_layer)
        row[FIELDS.CONFIG_16X4] = sub_layer.get_compilation_params("use_16x4_sc") == Subclusters16x4Policy.enabled

        return row

    def _add_layer_optimization_information(self, row, hn_layer):
        try:
            row[self.FIELDS.INPUT_ACTIVATION_BITS] = hn_layer.precision_config.precision_mode.input_bits()
        except ValueError:
            row[self.FIELDS.INPUT_ACTIVATION_BITS] = self.FIELD_NOT_AVAILABLE
        try:
            row[self.FIELDS.WEIGHTS_BITS] = hn_layer.precision_config.precision_mode.weight_bits()
        except ValueError:
            row[self.FIELDS.WEIGHTS_BITS] = self.FIELD_NOT_AVAILABLE
        min_values, max_values = self._get_input_range(hn_layer)
        row[self.FIELDS.INPUT_RANGE_MIN] = min_values[0]
        row[self.FIELDS.INPUT_RANGE_MAX] = max_values[0]
        row[self.FIELDS.KERNEL_RANGE_MIN] = self._get_kernel_min_range(hn_layer)
        row[self.FIELDS.KERNEL_RANGE_MAX] = self._get_kernel_max_range(hn_layer)
        row[self.FIELDS.OUTPUT_RANGE_MIN], row[self.FIELDS.OUTPUT_RANGE_MAX] = self._get_output_range(hn_layer)
        try:
            row[self.FIELDS.OUTPUT_ACTIVATION_BITS] = hn_layer.precision_config.precision_mode.output_bits()
        except ValueError:
            row[self.FIELDS.OUTPUT_ACTIVATION_BITS] = self.FIELD_NOT_AVAILABLE
        row[self.FIELDS.BIAS_BITS] = self._get_bias_bits(hn_layer, row[self.FIELDS.WEIGHTS_BITS])

    @staticmethod
    def _get_kernel(hn_layer_name, params):
        full_hn_name = hn_layer_name + "/kernel:0"
        if full_hn_name in params:
            return params[full_hn_name]

    def _get_bias_bits(self, hn_layer, weight_bits):
        bias_mode = hn_layer.precision_config.bias_mode
        if weight_bits != self.FIELD_NOT_AVAILABLE and bias_mode in [
            BiasMode.double_scale_initialization,
            BiasMode.double_scale_decomposition,
        ]:
            return 2 * weight_bits

        return weight_bits

    def _get_input_range(self, hn_layer):
        ranges = []
        if len(hn_layer.inputs) > 1:
            for input_layer in hn_layer.inputs:
                ranges.append(self._get_io_range(f"{input_layer}/limvals_out:0", self._translated_params))
        else:
            ranges.append(self._get_io_range(f"{hn_layer.name}/limvals_in:0", self._translated_params))

        return list(zip(*ranges))

    def _get_output_range(self, hn_layer):
        return self._get_io_range(f"{hn_layer.name}/limvals_out:0", self._translated_params)

    @staticmethod
    def _get_io_range(io_range_key, params):
        if io_range_key in params:
            min_value, max_value = params[io_range_key][:2]
            return float(min_value), float(max_value)

        return [Estimator.FIELD_NOT_AVAILABLE] * 2

    def _get_kernel_max_range(self, hn_layer):
        return self._get_kernel_range(hn_layer, np.max)

    def _get_kernel_min_range(self, hn_layer):
        return self._get_kernel_range(hn_layer, np.min)

    def _get_kernel_range(self, hn_layer, method):
        kernel = self._get_kernel(hn_layer.name, self._params)
        if kernel is not None:
            return float(method(kernel))

        return Estimator.FIELD_NOT_AVAILABLE

    def _get_defuse_type(self, row, hn_layer):
        if hasattr(hn_layer, "defuse_type") and hn_layer.defuse_type != DefuseType.none:
            row[self.FIELDS.DEFUSE_MODE] = hn_layer.defuse_type.value

    def _get_dilations(self, row, hn_layer):
        if hasattr(hn_layer, "dilations") and hn_layer.dilations is not None and len(hn_layer.dilations) == 4:
            row[self.FIELDS.DILATION_HEIGHT] = hn_layer.dilations[1]
            row[self.FIELDS.DILATION_WIDTH] = hn_layer.dilations[2]

    def _get_elementwise_feed_repeat(self, row, hn_layer):
        layer_params = self._translated_params.get(hn_layer.name)
        if layer_params and layer_params.get("elementwise_addition/feed_repeat"):
            row[self.FIELDS.ELEMENTWISE_FEED_REPEAT] = layer_params.get("elementwise_addition/feed_repeat")

    def _unequal_output_shape_dims_layer_attr(self, row, hn_layer):
        height = []
        width = []
        features = []

        for shape in hn_layer.input_shapes:
            if len(shape) == 4:
                height.append(str(shape[1]))
                width.append(str(shape[2]))
            features.append(str(shape[-1]))

        row[self.FIELDS.HEIGHT] = "|".join(height)
        row[self.FIELDS.WIDTH] = "|".join(width)
        row[self.FIELDS.INPUT_CHANNELS] = "|".join(features)

    def _get_theoretical_cycles(self, hn_layer, subclusters):
        """Estimate the cycle count of a conv or depthwise layer on ``subclusters`` subclusters.

        Returns ``None`` for every other layer type. For conv layers the rounded-up
        count is additionally divided by ``hn_layer.groups``, so the result may be
        fractional.

        NOTE(review): ``old_div`` on two ints presumably floors (Python 2 division),
        which would make the inner ``math.ceil`` calls no-ops for int dimensions, and
        ``subclusters == 0`` would raise ``ZeroDivisionError`` - confirm both are intended.
        """
        is_conv = hn_layer.op == LayerType.conv
        if not is_conv and not (hn_layer.op == LayerType.dw):
            return None

        # Conv work scales with input * output features, depthwise with input features only.
        feature_work = hn_layer.input_features
        if is_conv:
            feature_work = feature_work * hn_layer.output_features

        output_rows = math.ceil(old_div(hn_layer.input_height, hn_layer.stride_height))
        output_cols = math.ceil(old_div(hn_layer.input_width, hn_layer.stride_width))
        total_work = feature_work * output_rows * output_cols * hn_layer.kernel_height * hn_layer.kernel_width

        theoretical_cycles = math.ceil(old_div(total_work, (subclusters * 64.0)))
        if is_conv:
            theoretical_cycles /= hn_layer.groups

        return theoretical_cycles

    def _add_layer_shapes(self, resources):
        row = {}
        FIELDS = self.FIELDS

        if resources.layer_index in self._layers_by_index:
            hn_layer = self._layers_by_index[resources.layer_index]
            row = self._add_network_information_to_layer(hn_layer, self._hn)
            theoretical_cycles = self._get_theoretical_cycles(hn_layer, resources.subclusters)
            if theoretical_cycles is not None:
                row[FIELDS.THEORETICAL_CYCLES] = theoretical_cycles
            if resources.subclusters > 0:
                row[FIELDS.OF_PER_SC] = old_div(resources.padded_output_features, resources.subclusters)
                if hn_layer.op == LayerType.feature_splitter:
                    row[FIELDS.FEATURE_ALIGNMENT_UTIL] = 0
                else:
                    row[FIELDS.FEATURE_ALIGNMENT_UTIL] = hn_layer.output_features / resources.padded_output_features
                try:
                    row[FIELDS.WIDTH_ALIGNMENT_UTIL] = (
                        hn_layer.input_width / (math.ceil(hn_layer.input_width / 16) * 16)
                        if row[FIELDS.CONFIG_16X4]
                        else hn_layer.input_width / (math.ceil(hn_layer.input_width / 8) * 8)
                    )
                except Exception:
                    row[FIELDS.WIDTH_ALIGNMENT_UTIL] = 0

            if resources.l3_weight_mem_utilization:
                row[FIELDS.L2_WEIGHTS_USAGE_RATIO] = old_div(
                    resources.l2_weight_mem_utilization,
                    (resources.l2_weight_mem_utilization + resources.l3_weight_mem_utilization),
                )

            row[FIELDS.OF_PER_MAC] = resources.features_per_mac

        return row

    def _get_context_resources_min_fps(self, resources):
        if not self._is_multi_context:
            return self._min_fps
        context_name = self._context_by_layer_id[resources.layer_index]
        return self._context_min_fps[context_name]

    def _power(self, resources):
        power = (
            resources.mac_computation_power
            + resources.mac_activation_power
            + resources.l3_weight_mem_power
            + resources.l3_input_read_mem_power
            + resources.l3_output_write_mem_power
            + resources.l2_context_read_mem_power
            + resources.l2_context_write_mem_power
            + resources.l2_weight_mem_power
            + resources.apu_activation_power
            + resources.input_aligner_shift_power
            + resources.layer_controller_power
        )

        if power == 0:
            return power

        context_min_fps = self._get_context_resources_min_fps(resources)
        if context_min_fps in [0, Estimator.FIELD_NOT_AVAILABLE]:
            raise ProfilingException("Bottleneck FPS is 0 but power > 0")

        fps = self._stream_fps if self._stream_fps is not None else self._min_fps
        factor = fps / context_min_fps

        return power * factor

    def _calculate_min_fps_per_context(self, integrated_hw_graph):
        result = {}
        if not integrated_hw_graph.HasField("contexts_graph"):
            # Legacy JLFs without context information support
            raise ProfilingException(Estimator.JLF_NOT_SUPPORTED_ERROR_MSG)

        contexts_graph = integrated_hw_graph.contexts_graph
        for context_node in contexts_graph.nodes:
            context_name = context_node.context.context_name
            context_fps = [
                self._calculate_layer_fps(None, resources)
                for resources in context_node.network_graph.resources_list
                if resources.frame_total_cycles != 0
            ]
            # in rare cases, such as a context with only nms layers, we will use the network's fps.
            context_fps = [fps for fps in context_fps if fps is not None]
            if len(context_fps) > 0:
                min_fps = min(fps for fps in context_fps if fps is not None)
            else:
                min_fps = Estimator.FIELD_NOT_AVAILABLE
            result[context_name] = min_fps

        return result

    def _row(self, resources, hn_layer=None):
        """Build the per-layer report row for one ``resources`` entry.

        Args:
            resources: Resource record of a layer (attributes such as
                ``layer_index``, ``subclusters`` and the power/utilization values
                are read from it).
            hn_layer: Layer the resources belong to. When falsy it is looked up by
                ``resources.layer_index``; only in that case are the name and type
                fields filled here.

        Returns:
            * ``{}`` when no ``hn_layer`` was given and the layer index is unknown.
            * ``None`` when the layer is skipped (see the power check below); its
              output blocks are still accumulated per context.
            * Otherwise a dict holding every key of ``FIELDS.FIELDS_LIST``.
        """
        # Every known field starts as an empty string so the row always has all keys.
        row = {field: "" for field in self.FIELDS.FIELDS_LIST}
        FIELDS = self.FIELDS

        if not hn_layer:
            if resources.layer_index in self._layers_by_index:
                hn_layer = self._layers_by_index[resources.layer_index]
                row[FIELDS.NAME] = self._get_layer_name(hn_layer)
                row[FIELDS.TYPE] = hn_layer.op.name
            else:
                return {}

        row[FIELDS.DID_REACH_FPS] = resources.did_reach_fps
        row[FIELDS.IS_4BIT_ENHANCED] = resources.is_4bit_enhanced
        row[FIELDS.IS_SMUFFER_IN] = resources.is_smuffer_in
        row[FIELDS.IS_SMUFFER_OUT] = resources.is_smuffer_out
        # Total number of buffers over all buffer entries of the layer.
        row[FIELDS.BUFFERS_NUM] = 0
        for entry in resources.buffers:
            row[FIELDS.BUFFERS_NUM] += entry.buffers

        row[FIELDS.BUFFER_SIZE] = resources.output_size
        # for backwards compatibility. version 3.12 single scope networks contain the scope in the name.
        layer_context_key = hn_layer.name if hn_layer.name in self._context_by_node else self._get_layer_name(hn_layer)
        row[FIELDS.CONTEXT] = self._context_by_node[layer_context_key]
        layer_power = self._power(resources)
        # Skip zero-power layers and I/O layers (unless their power is ignored anyway),
        # but keep counting their output blocks for the context they belong to.
        if (layer_power == 0 or should_ignore_layer(hn_layer)) and not should_ignore_power(hn_layer):
            if row[FIELDS.CONTEXT] not in self._context_ignored_output_blocks:
                self._context_ignored_output_blocks[row[FIELDS.CONTEXT]] = 0
            self._context_ignored_output_blocks[row[FIELDS.CONTEXT]] += resources.output_blocks
            return None

        row[FIELDS.LAYER_DEFUSE_NAME] = hn_layer.defuse_name
        row[FIELDS.MACS] = hn_layer.macs
        row[FIELDS.OPS] = hn_layer.ops
        # 1.0 for layers that perform MAC operations, 0.0 for all others.
        row[FIELDS.MAC_LAYERS_UTIL] = 1.0 if row[FIELDS.MACS] > 0 else 0.0
        row[FIELDS.WEIGHTS] = hn_layer.weights
        show_power = self._hw_arch.does_support_power_profiling and not should_ignore_power(hn_layer)
        row[FIELDS.POWER] = f"{layer_power:.2f}" if show_power else None
        row[FIELDS.LATENCY] = resources.node_end_time / self._clk_freq

        # Calculate fps for the layer
        self._set_layer_cluster_util_and_fps(hn_layer, resources, row)

        row[FIELDS.TOTAL_CYCLES] = resources.frame_total_cycles
        row[FIELDS.SCS] = resources.subclusters
        row[FIELDS.LCUS] = 1
        is_mixed_mems = hn_layer.get_compilation_params("mixed_mem") == UseL2WeightsPolicy.enabled
        if is_mixed_mems and hn_layer.weights > 0:
            row[FIELDS.L2_WEIGHTS] = min(hn_layer.weights, resources.l2_weights * resources.subclusters)
        else:
            row[FIELDS.L2_WEIGHTS] = 0
        row[FIELDS.L3_OUTPUT_CUTS] = resources.output_blocks
        row[FIELDS.L3_WEIGHT_CUTS] = resources.weights_blocks
        row[FIELDS.L4_CUTS] = resources.l4_cuts

        # Default ratio; may be overridden by the _add_layer_shapes() update below.
        row[FIELDS.L2_WEIGHTS_USAGE_RATIO] = 1
        row[FIELDS.NODE_LAYER_INPUT_BW] = resources.node_layer_input_bw
        row[FIELDS.NODE_LAYER_OUTPUT_BW] = resources.node_layer_output_bw
        row[FIELDS.MAC_COMPUTATION_UTILIZATION] = resources.mac_computation_utilization
        row[FIELDS.ACTIVE_MAC_UTIL] = resources.mac_computation_utilization
        row[FIELDS.APU_ACTIVATION_UTILIZATION] = resources.apu_activation_utilization

        # Detailed utilization and power breakdowns are only reported in debug mode.
        if self._debug:
            self._set_util_fields(row, resources)
            self._set_power_fields(row, resources)

        row[FIELDS.NODE_LAYER_LATENCY] = self._normalize_time_clk_period(
            resources.node_end_time,
            resources.node_start_time,
        )
        # Used to be estimated_frame_total_cycles. Not removed to not break any direct indices
        row[FIELDS.DEPRECATED] = 0
        if hasattr(hn_layer, "output_features") and resources.subclusters:
            row[FIELDS.OUTPUT_FEATURES_PER_SC] = math.ceil(hn_layer.output_features / resources.subclusters)

        row[FIELDS.APUS] = resources.apus
        row[FIELDS.INPUT_ALIGNERS] = resources.input_aligners
        row[FIELDS.L2_DATA_USAGE] = resources.temporal_contexts
        # Shape/alignment values are merged last, so they win over the values set above.
        row.update(self._add_layer_shapes(resources))
        # NOTE(review): if any of these factors is still the "" placeholder (e.g. FPS was
        # ignored for the layer) this product would raise TypeError - confirm it cannot
        # happen for layers with SCS > 0.
        if row[FIELDS.SCS] > 0:
            row[FIELDS.EFFECTIVE_MAC_UTIL] = (
                row[FIELDS.ACTIVE_MAC_UTIL]
                * row[FIELDS.WIDTH_ALIGNMENT_UTIL]
                * row[FIELDS.FEATURE_ALIGNMENT_UTIL]
                * row[FIELDS.BALANCE_FPS_UTIL]
            )
        return row

    def _set_layer_cluster_util_and_fps(self, hn_layer, resources, row):
        """Write the FPS and FPS-balance utilization fields of a layer into ``row``.

        Returns ``None`` when the layer's FPS is ignored, ``FIELD_NOT_AVAILABLE``
        when the context has no minimum FPS (only the FPS field is written then),
        and the layer FPS otherwise.
        """
        if should_ignore_fps(hn_layer):
            return None

        fields = self.FIELDS
        layer_fps = self._calculate_layer_fps(hn_layer, resources)
        row[fields.FPS] = f"{layer_fps:.2f}"

        context_min_fps = self._get_context_resources_min_fps(resources)
        if context_min_fps == Estimator.FIELD_NOT_AVAILABLE:
            return Estimator.FIELD_NOT_AVAILABLE

        # Ratio between the context bottleneck FPS and this layer's FPS.
        balance = context_min_fps / layer_fps
        row[fields.SUBCLUSTER_UTILIZATION] = balance * resources.mac_computation_utilization
        row[fields.BALANCE_FPS_UTIL] = balance
        return layer_fps

    def _get_hn_layer_from_resources(self, resources):
        return self._layers_by_index[resources.layer_index]

    def _calculate_layer_fps(self, hn_layer, resources):
        if hn_layer is None:
            hn_layer = self._get_hn_layer_from_resources(resources)
            if hn_layer is None:
                return None
        if should_ignore_fps(hn_layer) or should_ignore_layer(hn_layer):
            return None
        return self._get_fps(resources.frame_total_cycles, resources.clk_freq)

    def _set_power_fields(self, row, resources):
        POWER_FIELDS = self.POWER_FIELDS
        row[POWER_FIELDS.MAC_COMPUTATION_POWER] = resources.mac_computation_power
        row[POWER_FIELDS.MAC_ACTIVATION_POWER] = resources.mac_activation_power
        row[POWER_FIELDS.TOTAL_WEIGHT_MEM_POWER] = resources.l2_weight_mem_power + resources.l3_weight_mem_power
        row[POWER_FIELDS.L3_WEIGHT_MEM_POWER] = resources.l3_weight_mem_power
        row[POWER_FIELDS.L3_INPUT_READ_MEM_POWER] = resources.l3_input_read_mem_power
        row[POWER_FIELDS.L3_OUTPUT_WRITE_MEM_POWER] = resources.l3_output_write_mem_power
        row[POWER_FIELDS.L2_CONTEXT_READ_MEM_POWER] = resources.l2_context_read_mem_power
        row[POWER_FIELDS.L2_CONTEXT_WRITE_MEM_POWER] = resources.l2_context_write_mem_power
        row[POWER_FIELDS.L2_WEIGHT_MEM_POWER] = resources.l2_weight_mem_power
        row[POWER_FIELDS.APU_ACTIVATION_POWER] = resources.apu_activation_power
        row[POWER_FIELDS.INPUT_ALIGNER_SHIFT_POWER] = resources.input_aligner_shift_power
        row[POWER_FIELDS.LAYER_CONTROLLER_POWER] = resources.layer_controller_power

    def _set_util_fields(self, row, resources):
        UTILIZATION_FIELDS = self.UTILIZATION_FIELDS

        row[UTILIZATION_FIELDS.L3_WEIGHT_MEM_UTILIZATION] = resources.l3_weight_mem_utilization
        row[UTILIZATION_FIELDS.TOTAL_WEIGHT_MEM_UTILIZATION] = (
            resources.l2_weight_mem_utilization + resources.l3_weight_mem_utilization
        )
        row[UTILIZATION_FIELDS.L3_INPUT_READ_MEM_UTILIZATION] = resources.l3_input_read_mem_utilization
        row[UTILIZATION_FIELDS.L3_OUTPUT_WRITE_MEM_UTILIZATION] = resources.l3_output_write_mem_utilization
        row[UTILIZATION_FIELDS.L2_CONTEXT_READ_MEM_UTILIZATION] = resources.l2_context_read_mem_utilization
        row[UTILIZATION_FIELDS.L2_CONTEXT_WRITE_MEM_UTILIZATION] = resources.l2_context_write_mem_utilization
        row[UTILIZATION_FIELDS.L2_WEIGHT_MEM_UTILIZATION] = resources.l2_weight_mem_utilization

        row[UTILIZATION_FIELDS.INPUT_ALIGNER_SHIFT_UTILIZATION] = resources.input_aligner_shift_utilization
        row[UTILIZATION_FIELDS.LAYER_CONTROLLER_UTILIZATION] = resources.layer_controller_utilization
        row[UTILIZATION_FIELDS.MAC_ACTIVATION_UTILIZATION] = resources.mac_activation_utilization

    @staticmethod
    def _combine_field(aggregation_func, type_func, result_row, new_row, key):
        if not new_row[key] and not isinstance(new_row[key], (int, float)):
            return result_row
        if key in result_row:
            old_value = type_func(result_row[key])
            new_value = type_func(new_row[key])
            result_row[key] = aggregation_func([old_value, new_value])
        else:
            result_row[key] = new_row[key]
        return result_row

    @staticmethod
    def _count_scs(rows):
        return sum(int(row[Estimator.FIELDS.SCS]) for row in rows)

    # Utilization fields that are averaged when several rows are combined into one.
    # Each entry is (field to aggregate, field holding the per-row weight or None for a
    # weight of 1, callable that computes the divisor from the rows being combined).
    # ``_count_scs.__get__(object)`` extracts the plain function from the staticmethod
    # object, which is what the name refers to inside the class body.
    AGGREGATE_UTIL_FIELDS = [
        (FIELDS.MAC_COMPUTATION_UTILIZATION, FIELDS.SCS, _count_scs.__get__(object)),
        (FIELDS.APU_ACTIVATION_UTILIZATION, None, len),
        (FIELDS.SUBCLUSTER_UTILIZATION, FIELDS.SCS, _count_scs.__get__(object)),
    ]

    @staticmethod
    def _combine_util_field(result_row, new_row, key, weight_key, div_factor):
        if not new_row[key] and not isinstance(new_row[key], (int, float)):
            return result_row
        weight = new_row[weight_key] if weight_key else 1
        if key in result_row:
            old_value = float(result_row[key])
            new_value = float(new_row[key])
            result_row[key] = old_value + (new_value * weight / div_factor)
        else:
            result_row[key] = float(new_row[key]) * weight / div_factor
        return result_row

    def _get_combined_layer_names(self):
        combined_layer_names = set()
        for layer in self._hn.nodes:
            sub_layers = layer.sub_layers if layer.op == LayerType.merged_layer else [layer]
            for sub_layer in sub_layers:
                if sub_layer.defuse_name and sub_layer.defuse_type not in [
                    DefuseType.none,
                    DefuseType.portal_l4,
                    DefuseType.l3_portal,
                    DefuseType.portal_ddr,
                    DefuseType.nms,
                ]:
                    combined_layer_names.add(sub_layer.defuse_name)

        return combined_layer_names

    @staticmethod
    def _calculate_combined_loss_factor(rows_to_combine, fields_to_multiply, field_to_ignore):
        assert isinstance(fields_to_multiply, list), "fields_to_multiply must be a list"
        assert not isinstance(field_to_ignore, list), "field_to_ignore must be a single field"
        dividend = 0
        divisor = 0

        for row in rows_to_combine:
            cur_divisor = 1.0
            cur_dividend = 1.0

            skip_row = False
            for field in fields_to_multiply:
                if isinstance(row[field], str):
                    skip_row = True
                    break
                cur_dividend *= row[field]
                if field != field_to_ignore:
                    cur_divisor *= row[field]
            if skip_row:
                continue

            dividend += cur_dividend

            if field_to_ignore in fields_to_multiply:
                divisor += cur_divisor
            elif cur_divisor == 0:
                divisor += 0
            elif row[field_to_ignore] != 0:
                divisor += cur_divisor / row[field_to_ignore]
            elif row[field_to_ignore] == 0:
                raise AssertionError("Cannot divide non-zero number by zero")

        assert not (dividend != 0 and divisor == 0), "Impossible division"
        return 0 if divisor == 0 else 1.0 * dividend / divisor

    def _combine_layers(self, rows):
        # Validate original_hailo_nn is set. If not, just return the rows as is
        if not self._original_hailo_nn:
            self._logger.warning(
                "Tried to combine layers although original_hailo_nn was not set. layers were not combined",
            )
            return rows

        combined_layer_names = self._get_combined_layer_names()
        # Save a backup of the expanded rows for iteration.
        # The deepcopy is important because rows changes during the iteration
        expanded_rows = copy.deepcopy(rows)

        for combined_layer_name in combined_layer_names:
            index_at_list = 0
            rows_to_combine = [
                x
                for x in expanded_rows
                if ("defuse_name" in x and x["defuse_name"] and combined_layer_name == x["defuse_name"])
            ]

            # HACK: if nodes are in different contexts, we don't combine
            if any(row["context"] != rows_to_combine[0]["context"] for row in rows_to_combine):
                continue

            try:
                original_layer = self._original_hailo_nn.get_layer_by_name(combined_layer_name)
            except HailoNNException:
                continue
            row = self._add_network_information_to_layer(original_layer, self._original_hailo_nn)
            row[self.FIELDS.NAME] = self._get_layer_name(original_layer)
            row[self.FIELDS.TYPE] = original_layer.op.name
            row[self.FIELDS.MACS] = 0
            row[self.FIELDS.WEIGHTS] = original_layer.weights
            row[self.FIELDS.DEFUSE_MODE] = f"combined ({rows_to_combine[0][self.FIELDS.DEFUSE_MODE]})"

            for util_key, weight_key, div_factor_func in self.AGGREGATE_UTIL_FIELDS:
                div_factor = div_factor_func(rows_to_combine)
                if div_factor == 0:
                    continue
                for row_to_combine in rows_to_combine:
                    self._combine_util_field(row, row_to_combine, util_key, weight_key, div_factor)

            # For each loss factor, how the total effective_mac_util would have been affected if the loss factor was 1
            # on all combined layers
            for loss_factor_field in [
                self.FIELDS.ACTIVE_MAC_UTIL,
                self.FIELDS.WIDTH_ALIGNMENT_UTIL,
                self.FIELDS.FEATURE_ALIGNMENT_UTIL,
                self.FIELDS.BALANCE_FPS_UTIL,
                self.FIELDS.EFFECTIVE_MAC_UTIL,
            ]:
                row[loss_factor_field] = self._calculate_combined_loss_factor(
                    rows_to_combine,
                    [self.FIELDS.SCS, self.FIELDS.MAC_LAYERS_UTIL, self.FIELDS.EFFECTIVE_MAC_UTIL],
                    loss_factor_field,
                )

            # How many SCs are used for 'real' calculations, % of all SCs of the combined layer
            row[self.FIELDS.MAC_LAYERS_UTIL] = self._calculate_combined_loss_factor(
                rows_to_combine,
                [self.FIELDS.SCS, self.FIELDS.MAC_LAYERS_UTIL],
                self.FIELDS.MAC_LAYERS_UTIL,
            )

            for row_to_combine in rows_to_combine:
                # update fields
                for key, callback in self._fields_to_aggregate.items():
                    callback(row, row_to_combine, key)

                # Remove the combined layer and save the index so combined item would be added at
                # the same location
                index_at_list = rows.index(row_to_combine)

                rows.remove(row_to_combine)
                self.layers -= 1

            rows.insert(index_at_list, row)
            self.layers += 1

        self._combine_predecessors_successors(rows)
        return rows

    def _combine_predecessors_successors(self, rows):
        for row in rows:
            try:
                original_layer = self._original_hailo_nn.get_layer_by_name(row["layer_name"])
                predecessors = [
                    self._get_layer_name(layer) for layer in self._original_hailo_nn.predecessors(original_layer)
                ]
                successors = [
                    self._get_layer_name(layer) for layer in self._original_hailo_nn.successors(original_layer)
                ]
                row[self.FIELDS.PREDECESSORS] = " ".join(predecessors)
                row[self.FIELDS.SUCCESSORS] = " ".join(successors)
            except HailoNNException:
                row[self.FIELDS.PREDECESSORS] = ""
                row[self.FIELDS.SUCCESSORS] = ""

    def _toposort_rows(self, layer_resources):
        try:
            index = self._layers_topological_sort.index(layer_resources[self.FIELDS.NAME])
        except ValueError:
            index = self.MAX_TOPOSORT_INDEX
            if (
                layer_resources[self.FIELDS.NAME]
                in [self._get_layer_name(layer) for layer in self._hn.layers_by_index.values()]
                and self._should_use_logical_layers
            ):
                return index
            self._logger.warning(
                f"Failed to find layer {layer_resources[self.FIELDS.NAME]} in "
                f"physical HN during layers sorting. Placing it at the end.",
            )
        return index

    def _expand_merged_layers(self, rows):
        # Validate original_hailo_nn is set. If not, just return the rows as is
        if not self._original_hailo_nn:
            self._logger.warning(
                "Tried to expand layers although original_hailo_nn was not set. layers were not expanded",
            )
            return rows

        # if the merged_layer contains 'automatic' sub-layers, we only add the merged_layer to the table.
        # else, we delete the merge_layer and add all its sub_layers to the table.
        merged_layers = {x for x in self._hn.nodes if x.op == LayerType.merged_layer}
        for merged_layer in merged_layers:
            expended_rows = []
            merged_row = next(filter(lambda y: y["layer_name"] == self._get_layer_name(merged_layer), rows))
            index_in_list = rows.index(merged_row)
            sub_layers = merged_layer.sub_layers
            sub_layers_num = len(sub_layers)
            for sub_layer in sub_layers:
                row = copy.deepcopy(merged_row)
                self._update_sublayer_info(row, sub_layer, sub_layers_num)
                expended_rows.append(row)

            # delete the merged layer and add the expended layers to the table
            rows.remove(merged_row)
            self.layers -= 1
            for expended_row in expended_rows:
                rows.insert(index_in_list, expended_row)
                index_in_list += 1
                self.layers += 1
        return rows

    def _update_sublayer_info(self, row, sub_layer, sub_layers_num):
        orig_layer_name = sub_layer.defuse_name if sub_layer.defuse_name else sub_layer.name
        try:
            orig_hn_sub_layer = self._original_hailo_nn.get_layer_by_name(orig_layer_name)
        except HailoNNException:
            orig_hn_sub_layer = None
        row = self._add_network_information_to_layer(
            orig_hn_sub_layer, self._original_hailo_nn, row=row, sub_layer=sub_layer
        )
        row[self.FIELDS.NAME] = self._get_layer_name(sub_layer)
        row[self.FIELDS.LAYER_DEFUSE_NAME] = sub_layer.defuse_name
        row[self.FIELDS.TYPE] = sub_layer.op.name
        row[self.FIELDS.MACS] = sub_layer.macs
        row[self.FIELDS.WEIGHTS] = sub_layer.weights
        row[self.FIELDS.SCS] = float(row[self.FIELDS.SCS]) / sub_layers_num
        row[self.FIELDS.APUS] = float(row[self.FIELDS.APUS]) / sub_layers_num
        row[self.FIELDS.INPUT_ALIGNERS] = float(row[self.FIELDS.INPUT_ALIGNERS]) / sub_layers_num
        row[self.FIELDS.LCUS] = float(row[self.FIELDS.LCUS]) / sub_layers_num
        if row[self.FIELDS.POWER] is not None:
            row[self.FIELDS.POWER] = float(row[self.FIELDS.POWER]) / sub_layers_num
        row[self.FIELDS.TOTAL_CYCLES] = row[self.FIELDS.TOTAL_CYCLES] / sub_layers_num
        row[self.FIELDS.L3_OUTPUT_CUTS] = float(row[self.FIELDS.L3_OUTPUT_CUTS]) / sub_layers_num
        row[self.FIELDS.L3_WEIGHT_CUTS] = float(row[self.FIELDS.L3_WEIGHT_CUTS]) / sub_layers_num
        row[self.FIELDS.L2_WEIGHTS] = float(row[self.FIELDS.L2_WEIGHTS]) / sub_layers_num
        theoretical_cycles = self._get_theoretical_cycles(sub_layer, row[self.FIELDS.SCS])
        if theoretical_cycles is not None:
            row[self.FIELDS.THEORETICAL_CYCLES] = theoretical_cycles

        # By doing no change here, we are using the original layer's loss factors.
        # In this way, the total context loss factors remain be the same.

        return row

    def _parse_resources(self):
        rows = []
        for resources in self._graph.resources_list:
            row = self._row(resources)
            if row is not None:
                rows.append(row)

        # If original_hailo_nn is set, combine defused and split merged layers if needed
        if self._original_hailo_nn and self._should_use_logical_layers:
            rows = self._expand_merged_layers(rows)
            rows = self._combine_layers(rows)

        # Sort the rows by the hn stable topological sort
        rows.sort(key=self._toposort_rows)
        self._context_rows = self._calculate_context_loss_factors(rows)
        self._rows = rows

    def _calculate_context_loss_factors(self, rows):
        """Compute the per-context utilization ("loss factor") rows plus a leading summary row.

        Args:
            rows: Per-layer row dicts keyed by ``self.FIELDS`` names. For each context, only the rows
                whose ``CONTEXT`` matches and whose ``SCS`` value is a positive number are used.

        Returns:
            A list of dicts keyed by ``self.PER_CONTEXT_FIELDS.FIELDS_LIST``. Index 0 is the
            ``"contexts_total"`` summary row, followed by one row per key of
            ``self._context_min_fps`` (in that mapping's iteration order).

        Side effects:
            Always sets ``self._mac_util``. When runtime data is present, it also writes
            ``MEASURED_MAC_UTIL`` into every entry of ``self._performance_from_runtime_data`` and,
            when the overhead information is valid, sets ``self._latency_multi_context``.
        """
        pcf = self.PER_CONTEXT_FIELDS
        not_available = Estimator.FIELD_NOT_AVAILABLE

        # Per-context bandwidth columns, paired with the matching key in self._context_bw[context_idx]
        bw_field_keys = [
            (pcf.BOUNDARY_IN, "boundary_in"),
            (pcf.BOUNDARY_OUT, "boundary_out"),
            (pcf.INTER_CONTEXT_IN, "inter_context_in"),
            (pcf.INTER_CONTEXT_OUT, "inter_context_out"),
            (pcf.DDR_PORTALS_IN, "ddr_portals_in"),
            (pcf.DDR_PORTALS_OUT, "ddr_portals_out"),
        ]
        bw_fields = [field for field, _ in bw_field_keys]

        # Hardware totals are the same for every context, so compute them once
        consts = self._hw_arch.consts
        total_subclusters = consts["CORE_PKG::ACTUAL_CLUSTERS"] * consts["CLUSTER_UNITS::SUBCLUSTERS"]
        total_lcus = consts["CORE_PKG::ACTUAL_CLUSTERS"] * consts["CLUSTER_LAYERS::LAYERS"]
        total_l3_cuts = consts["CORE_PKG::ACTUAL_CLUSTERS"] * consts["CLUSTER_UNITS::MEMORY"]

        # Save room for summary row
        summary_row = {field: "" for field in pcf.FIELDS_LIST}
        summary_row[pcf.CONTEXT_NAME] = "contexts_total"
        result_context_rows = [summary_row]

        # Context rows
        available_contexts = list(self._context_min_fps.keys())
        for context_idx, context_name in enumerate(available_contexts):
            context_loss_factors = {field: "" for field in pcf.FIELDS_LIST}
            context_loss_factors[pcf.CONTEXT_NAME] = context_name
            context_rows = [
                row
                for row in rows
                if (
                    row[self.FIELDS.CONTEXT] == context_name
                    and isinstance(row[self.FIELDS.SCS], (int, float))
                    and row[self.FIELDS.SCS] > 0
                )
            ]

            context_loss_factors[pcf.MAPPING_UTIL] = (
                sum(row[self.FIELDS.SCS] for row in context_rows) / total_subclusters
            )

            context_loss_factors[pcf.LCU_UTIL] = sum(row[self.FIELDS.LCUS] for row in context_rows) / total_lcus

            context_loss_factors[pcf.L3_UTIL] = (
                sum((row[self.FIELDS.L3_OUTPUT_CUTS] + row[self.FIELDS.L3_WEIGHT_CUTS]) for row in context_rows)
                + self._context_ignored_output_blocks[context_name]
            ) / total_l3_cuts

            for loss_factor_field in [
                pcf.ACTIVE_MAC_UTIL,
                pcf.WIDTH_ALIGNMENT_UTIL,
                pcf.FEATURE_ALIGNMENT_UTIL,
                pcf.BALANCE_FPS_UTIL,
                pcf.EFFECTIVE_MAC_UTIL,
            ]:
                assert (
                    loss_factor_field in self.FIELDS.FIELDS_LIST
                ), "per-context field should have also been a per-layer field"
                context_loss_factors[loss_factor_field] = self._calculate_combined_loss_factor(
                    context_rows,
                    [self.FIELDS.SCS, self.FIELDS.MAC_LAYERS_UTIL, self.FIELDS.EFFECTIVE_MAC_UTIL],
                    loss_factor_field,
                )

            # How many SCs are used for 'real' calculations, % of all SCs of the combined layer
            context_loss_factors[pcf.MAC_LAYERS_UTIL] = self._calculate_combined_loss_factor(
                context_rows,
                [self.FIELDS.SCS, self.FIELDS.MAC_LAYERS_UTIL],
                self.FIELDS.MAC_LAYERS_UTIL,
            )
            context_loss_factors[pcf.SCS_MAC_UTIL] = (
                context_loss_factors[pcf.MAPPING_UTIL]
                * context_loss_factors[pcf.MAC_LAYERS_UTIL]
                * context_loss_factors[pcf.EFFECTIVE_MAC_UTIL]
            )

            context_min_fps = self._context_min_fps[context_name]
            context_loss_factors[pcf.FPS] = context_min_fps
            if context_min_fps == Estimator.FIELD_NOT_AVAILABLE:
                context_loss_factors[pcf.INVERSE_FPS] = not_available
                context_loss_factors[pcf.DRAIN] = not_available
                context_loss_factors[pcf.LATENCY_UTIL] = 1.0
                context_loss_factors[pcf.BATCH8_LATENCY_UTIL] = 1.0
                context_loss_factors[pcf.CONTEXT_SWITCH_SCS_MAC_UTIL] = 0.0
                context_loss_factors[pcf.BATCH8_CONTEXT_SWITCH_SCS_MAC_UTIL] = 0.0
            else:
                inverse_fps = 1.0 / context_min_fps
                context_loss_factors[pcf.INVERSE_FPS] = inverse_fps
                # This 'drain' time is actually the sum of the time it takes to the last layer to start working.
                # The sum of row latencies (or less if smuffers are used). It is in theory equal to the 'parallelism'
                # of the drain time, but in real life because of buffering
                # the drain time is higher (because the first LCU can be done quickly).
                # So it should have been called 'SUM_OF_LATENCIES', but it's a long name (:
                # Layers
                # ###############
                #   ###############
                #    ###############
                #        ###############
                #      ###############
                # ^^^^^^^   ==   ^^^^^^^
                # (sum_l)        (drain)
                relevant_rows = [row[self.FIELDS.LATENCY] for row in context_rows]
                if len(relevant_rows) > 0:
                    drain = max(relevant_rows) - inverse_fps
                    context_loss_factors[pcf.DRAIN] = drain
                    context_loss_factors[pcf.LATENCY_UTIL] = inverse_fps / (inverse_fps + drain)
                    context_loss_factors[pcf.BATCH8_LATENCY_UTIL] = inverse_fps * 8 / (inverse_fps * 8 + drain)
                    context_loss_factors[pcf.CONTEXT_SWITCH_SCS_MAC_UTIL] = (
                        context_loss_factors[pcf.SCS_MAC_UTIL] * context_loss_factors[pcf.LATENCY_UTIL]
                    )
                    context_loss_factors[pcf.BATCH8_CONTEXT_SWITCH_SCS_MAC_UTIL] = (
                        context_loss_factors[pcf.SCS_MAC_UTIL] * context_loss_factors[pcf.BATCH8_LATENCY_UTIL]
                    )
                else:
                    # No layer with a positive SC count in this context: nothing to derive a latency from
                    context_loss_factors[pcf.DRAIN] = 0
                    for field in [
                        pcf.LATENCY_UTIL,
                        pcf.BATCH8_LATENCY_UTIL,
                        pcf.CONTEXT_SWITCH_SCS_MAC_UTIL,
                        pcf.BATCH8_CONTEXT_SWITCH_SCS_MAC_UTIL,
                    ]:
                        context_loss_factors[field] = not_available

            context_loss_factors[pcf.CONTEXT_OVERHEAD] = self._get_context_overhead(
                available_contexts,
                context_idx,
            )
            if context_idx in self._context_bw:
                context_bw = self._context_bw[context_idx]
                for field, bw_key in bw_field_keys:
                    context_loss_factors[field] = context_bw[bw_key]
            else:
                for field in bw_fields:
                    context_loss_factors[field] = not_available

            result_context_rows.append(context_loss_factors)

        context_only_rows = result_context_rows[1:]

        # Update summary row
        summary_row[pcf.DRAIN] = not_available
        for loss_factor_field in [
            pcf.ACTIVE_MAC_UTIL,
            pcf.WIDTH_ALIGNMENT_UTIL,
            pcf.FEATURE_ALIGNMENT_UTIL,
            pcf.BALANCE_FPS_UTIL,
            pcf.MAC_LAYERS_UTIL,
            pcf.EFFECTIVE_MAC_UTIL,
            pcf.SCS_MAC_UTIL,
            pcf.MAPPING_UTIL,
        ]:
            summary_row[loss_factor_field] = self._calculate_combined_loss_factor(
                context_only_rows,
                [pcf.SCS_MAC_UTIL, pcf.INVERSE_FPS],
                loss_factor_field,
            )
        drain_values = [row[pcf.DRAIN] for row in context_only_rows]
        infer_values = [row[pcf.INVERSE_FPS] for row in context_only_rows]

        # A string here is a "not available" marker, so the totals cannot be computed
        if any(isinstance(field, str) for field in drain_values) or any(
            isinstance(field, str) for field in infer_values
        ):
            summary_row[pcf.LATENCY_UTIL] = not_available
            summary_row[pcf.BATCH8_LATENCY_UTIL] = not_available
            summary_row[pcf.CONTEXT_SWITCH_SCS_MAC_UTIL] = not_available
            summary_row[pcf.BATCH8_CONTEXT_SWITCH_SCS_MAC_UTIL] = not_available
        else:
            sum_drain = sum(drain_values)
            sum_infer = sum(infer_values)
            summary_row[pcf.LATENCY_UTIL] = sum_infer / (sum_infer + sum_drain)
            summary_row[pcf.BATCH8_LATENCY_UTIL] = sum_infer * 8 / (sum_infer * 8 + sum_drain)

            summary_row[pcf.CONTEXT_SWITCH_SCS_MAC_UTIL] = (
                summary_row[pcf.SCS_MAC_UTIL] * summary_row[pcf.LATENCY_UTIL]
            )

            summary_row[pcf.BATCH8_CONTEXT_SWITCH_SCS_MAC_UTIL] = (
                summary_row[pcf.SCS_MAC_UTIL] * summary_row[pcf.BATCH8_LATENCY_UTIL]
            )

        # Every context must carry numeric bandwidth values for the totals to be summable; checking
        # only the first context would let a later "not available" string reach sum() and raise.
        if any(isinstance(row[field], str) for row in context_only_rows for field in bw_fields):
            for field in bw_fields:
                summary_row[field] = not_available
        else:
            for field in bw_fields:
                summary_row[field] = sum(row[field] for row in context_only_rows)

        # Summary columns that depend on measured runtime data
        runtime_dependent_fields = [
            pcf.CONTEXT_OVERHEAD,
            pcf.OVERHEAD_UTIL,
            pcf.BATCH8_OVERHEAD_UTIL,
            pcf.TOTAL_MAC_UTIL,
            pcf.BATCH8_TOTAL_MAC_UTIL,
            pcf.FPS,
            pcf.INVERSE_FPS,
        ]

        if self._runtime_data is not None:
            invalid_overhead_info_message = None
            if len(available_contexts) == 1:
                invalid_overhead_info_message = "single context"
            elif self._processed_runtime_data[self.RUNTIME_FIELDS.RUNS][0][self.RUNTIME_FIELDS.BATCH_SIZE] not in [
                -1,
                1,
            ]:
                invalid_overhead_info_message = "no runtime data for batch 1"

            if invalid_overhead_info_message is not None:
                for field in runtime_dependent_fields:
                    summary_row[field] = invalid_overhead_info_message
                self._mac_util = self._min_fps * self.total_ops_per_frame / self._device_ops_per_second
            else:
                # NOTE(review): these sums (and the products below) assume every context row holds
                # numeric overhead/inverse-FPS/drain values; a "not available" string would raise here.
                sum_overhead = sum(row[pcf.CONTEXT_OVERHEAD] for row in context_only_rows)
                sum_inv_fps = sum(row[pcf.INVERSE_FPS] for row in context_only_rows)
                sum_drain = sum(row[pcf.DRAIN] for row in context_only_rows)

                summary_row[pcf.CONTEXT_OVERHEAD] = sum_overhead
                summary_row[pcf.OVERHEAD_UTIL] = (sum_inv_fps + sum_drain) / (sum_overhead + sum_inv_fps + sum_drain)
                summary_row[pcf.BATCH8_OVERHEAD_UTIL] = (8 * sum_inv_fps + sum_drain) / (
                    sum_overhead + 8 * sum_inv_fps + sum_drain
                )
                summary_row[pcf.TOTAL_MAC_UTIL] = (
                    summary_row[pcf.OVERHEAD_UTIL] * summary_row[pcf.CONTEXT_SWITCH_SCS_MAC_UTIL]
                )
                summary_row[pcf.BATCH8_TOTAL_MAC_UTIL] = (
                    summary_row[pcf.BATCH8_OVERHEAD_UTIL] * summary_row[pcf.BATCH8_CONTEXT_SWITCH_SCS_MAC_UTIL]
                )
                self._mac_util = summary_row[pcf.BATCH8_TOTAL_MAC_UTIL]
                summary_row[pcf.FPS] = self._performance_from_runtime_data[0][self.PERFORMANCE_DETAILS_FIELDS.FPS]
                if summary_row[pcf.FPS] == self.FIELD_NOT_AVAILABLE:
                    summary_row[pcf.INVERSE_FPS] = self.FIELD_NOT_AVAILABLE
                    self._latency_multi_context = self.FIELD_NOT_AVAILABLE
                else:
                    summary_row[pcf.INVERSE_FPS] = 1.0 / summary_row[pcf.FPS]
                    # in milliseconds
                    self._latency_multi_context = summary_row[pcf.INVERSE_FPS] * 1000

            # TODO: https://hailotech.atlassian.net/browse/SDK-43507
            for run in self._performance_from_runtime_data:
                run[self.PERFORMANCE_DETAILS_FIELDS.MEASURED_MAC_UTIL] = self._mac_util * 100
        else:
            for field in runtime_dependent_fields:
                summary_row[field] = "no runtime data"
            self._mac_util = self._min_fps * self.total_ops_per_frame / self._device_ops_per_second

        return result_context_rows

    def _get_context_overhead(self, available_contexts, context_idx):
        """Return the measured overhead of one context, or a string saying why it is unavailable.

        Args:
            available_contexts: All context names; a single-entry list yields ``"single context"``.
            context_idx: Index of the context inside the first run of the processed runtime data.

        Returns:
            ``"single context"``, ``"no runtime data"`` or ``"no runtime data for batch 1"`` when the
            overhead cannot be measured. Otherwise the part of the context's total time that is not
            covered by its layers (inference plus drain), divided by ``self._get_nnm_clk_freq()``.
        """
        if len(available_contexts) == 1:
            return "single context"
        if self._runtime_data is None:
            return "no runtime data"

        rt = self.RUNTIME_FIELDS
        first_run = self._processed_runtime_data[rt.RUNS][0]
        if first_run[rt.BATCH_SIZE] not in [-1, 1]:
            return "no runtime data for batch 1"

        context_data = first_run[rt.CONTEXTS][context_idx]
        context_span = context_data[rt.CONTEXT_TIME]
        total_time = context_span[rt.END] - context_span[rt.START]

        layer_spans = context_data[rt.LAYERS]
        if layer_spans:
            layer_starts = [span[rt.START] for span in layer_spans.values()]
            layer_ends = [span[rt.END] for span in layer_spans.values()]
            earliest_end = min(layer_ends)
            # From the first layer start until the first layer finishes
            infer_time = earliest_end - min(layer_starts)
            # From the first layer finishing until the last one finishes
            drain_time = max(layer_ends) - earliest_end
        else:
            # Only PP layers in context
            infer_time = 0
            drain_time = 0

        return (total_time - (infer_time + drain_time)) / self._get_nnm_clk_freq()

    def create_csv(self, csv_path):
        """Write the profiling report to ``csv_path`` as three consecutive CSV tables.

        The tables, separated by an empty line, are: the global model data (``self._total_network()``),
        the per-context rows (``self._context_rows``) and the per-layer rows (``self._rows``). In debug
        mode the per-layer table also carries the power and utilization columns.

        Args:
            csv_path: Destination file path; an existing file is overwritten.
        """
        # Work on a copy so the class-level field list is never extended in place
        fields_list = list(self.FIELDS.FIELDS_LIST)
        if self._debug:
            fields_list.extend(self.POWER_FIELDS.FIELDS_LIST)
            fields_list.extend(self.UTILIZATION_FIELDS.FIELDS_LIST)

        # newline="" is required by the csv module: the writer emits its own line terminators, and
        # without it platforms that translate newlines would produce "\r\r\n" row endings.
        with open(csv_path, "w", newline="") as csv_file:
            # Write global model data as first row group
            csv_writer = csv.DictWriter(csv_file, self.METADATA_FIELDS.FIELDS_LIST)
            csv_writer.writeheader()
            csv_writer.writerow(self._total_network())
            csv_file.write("\n")

            # Write per-context data as second row group
            csv_writer = csv.DictWriter(csv_file, self.PER_CONTEXT_FIELDS.FIELDS_LIST)
            csv_writer.writeheader()
            csv_writer.writerows(self._context_rows)
            csv_file.write("\n")

            # Write per-layer data as third row group
            csv_writer = csv.DictWriter(csv_file, fields_list)
            csv_writer.writeheader()
            csv_writer.writerows(self._rows)

    def _number_of_devices(self):
        """Return how many devices are needed to hold the network's resources.

        The subclusters, the L3 cuts (weights plus output blocks) and the LCUs are each divided by
        what a single target offers according to ``self._hw_arch.consts``; the largest rounded-up
        ratio is returned.
        """
        consts = self._hw_arch.consts
        clusters = consts["CORE_PKG::ACTUAL_CLUSTERS"]
        # (amount used by the network, amount available on one target)
        usage_and_capacity = (
            (self.subclusters, clusters * consts["CLUSTER_UNITS::SUBCLUSTERS"]),
            (self.weights_blocks + self.output_blocks, clusters * consts["CLUSTER_UNITS::MEMORY"]),
            (self.lcus, clusters * consts["CLUSTER_LAYERS::LAYERS"]),
        )
        return max(math.ceil(used / capacity) for used, capacity in usage_and_capacity)

    def create_log(self):
        """Compose the textual profiling summary and emit it through ``self._logger.info``.

        The report consists of three ``tabulate`` tables: model details, profiler input settings and
        the performance summary. For a multi-context model the throughput-derived entries stay
        "not available" unless ``self._fps_multi_context`` holds a value.
        """

        def giga(value):
            # Scale a per-frame quantity to units of 1e9
            return old_div(value, 1.0e9)

        model_details = [
            ["Input Tensors Shapes", ", ".join("x".join(map(str, shape)) for shape in self.input_shapes.values())],
            ["Operations per Input Tensor", f"{giga(self.total_ops_per_frame):.2f} GOPs"],
            ["Operations per Input Tensor", f"{giga(self.total_macs_per_frame):.2f} GMACs"],
            ["Pure Operations per Input Tensor", f"{giga(self.pure_total_ops_per_frame):.2f} GOPs"],
            ["Pure Operations per Input Tensor", f"{giga(self.pure_total_macs_per_frame):.2f} GMACs"],
            ["Model Parameters", f"{old_div(self.total_weights, 1.0e6):.2f} M"],
        ]
        sections = ["\nModel Details\n", tabulate.tabulate(model_details)]

        goal = self._optimization_goal_fps
        optimization_goal_fps = "Highest" if goal is None else f"{goal:.2f}"

        if self._is_multi_context:
            fps = latency = ops_per_second = macs_per_second = Estimator.FIELD_NOT_AVAILABLE
            context_fps = self._fps_multi_context
            if context_fps not in [None, self.FIELD_NOT_AVAILABLE]:
                fps = f"\t{context_fps:.2f} FPS"
                latency = f"{1000 / context_fps:.2f} ms"
                ops_per_second = f"{giga(self.total_ops_per_frame) * context_fps:.2f} GOP/s"
                macs_per_second = f"{giga(self.total_macs_per_frame) * context_fps:.2f} GMAC/s"
        else:
            latency = f"{self.latency:.2f} ms"
            # Prefer the stream FPS when it is set, otherwise fall back to the minimal FPS
            stream_fps = self._min_fps if self._stream_fps is None else self._stream_fps
            fps = f"\t{stream_fps:.2f} FPS"
            ops_per_second = f"{giga(self.total_ops_per_frame) * stream_fps:.2f} GOP/s"
            macs_per_second = f"{giga(self.total_macs_per_frame) * stream_fps:.2f} GMAC/s"

        input_settings = [
            ["Optimization Goal", f"Reach {optimization_goal_fps} FPS"],
            ["Profiler Mode", "Compiled"],
        ]
        sections.append("\n\nProfiler Input Settings\n")
        sections.append(tabulate.tabulate(input_settings))

        performance_summary = [
            ["Number of Devices", 1],
            ["Number of Contexts", self._number_of_contexts],
            ["Throughput", fps],
            ["Latency", latency],
            ["Operations per Second", ops_per_second],
            ["MACs per Second", macs_per_second],
            ["Total Input Bandwidth", str(human_size_throughput(self.gross_input_throughput))],
            ["Total Output Bandwidth", str(human_size_throughput(self.gross_output_throughput))],
            ["Context Switch Configs", str(human_size_throughput(self._total_configs, units="batch"))],
        ]
        sections.append("\n\nPerformance Summary\n")
        sections.append(tabulate.tabulate(performance_summary))

        self._logger.info("".join(sections))

    def get_stats(self):
        """Return the model and performance statistics as a nested dict.

        ``"model_details"`` holds the pure weight and operation totals and the input/output shapes
        rendered as ``"AxBxC"`` strings; ``"performance_details"`` is whatever
        ``self._get_performance_details()`` returns.
        """

        def render(shapes):
            return ["x".join(map(str, shape)) for shape in shapes.values()]

        model_details = {
            "weights": self.pure_total_weights,
            "total_ops_per_frame": self.pure_total_ops_per_frame,
            "input_shapes": render(self.input_shapes),
            "output_shapes": render(self.output_shapes),
        }
        return {
            "model_details": model_details,
            "performance_details": self._get_performance_details(),
        }

    def _get_performance_details(self):
        """Return the performance details as a list of dicts keyed by ``PERFORMANCE_DETAILS_FIELDS``.

        With runtime data the pre-computed ``self._performance_from_runtime_data`` is returned as is.
        Without it, a multi-context model reports only its number of contexts, while a single-context
        model reports the estimated values.
        """
        fields = self.PERFORMANCE_DETAILS_FIELDS

        # Single or multi context, with runtime data (currently only multi context)
        if self._runtime_data is not None:
            return self._performance_from_runtime_data

        # Multi context, without runtime data
        if self._is_multi_context:
            return [{fields.NUMBER_OF_CONTEXTS: self._number_of_contexts}]

        # Single context, without runtime data
        stream_fps = self._min_fps if self._stream_fps is None else self._stream_fps
        estimated = {
            fields.BATCH_SIZE: self.FIELD_NOT_AVAILABLE,
            fields.FPS: stream_fps,
            fields.LATENCY: self.latency,
            fields.POWER: self._get_total_power() or self.FIELD_NOT_AVAILABLE,
            fields.NUMBER_OF_CONTEXTS: self._number_of_contexts,
            fields.INPUT_BW: self.gross_input_throughput,
            fields.OUTPUT_BW: self.gross_output_throughput,
            fields.OPS_PER_SECOND: self.total_ops_per_frame * stream_fps,
            fields.MEASURED_MAC_UTIL: self.FIELD_NOT_AVAILABLE,
            fields.STREAM_FPS: f"{self._stream_fps:.2f}" if self._stream_fps else self.FIELD_NOT_AVAILABLE,
        }
        return [estimated]

    def _calculate_total(self):
        """Reset and recompute the network-wide totals stored on the instance.

        Three sources are aggregated:

        * ``self._graph.resources_list`` - hardware resources (subclusters, memory blocks, LCUs, ...),
          the per-layer power on top of a static power baseline, and the latency (largest node end
          time divided by ``self._clk_freq``, converted to milliseconds).
        * The "pure" network (``self._original_hailo_nn`` when set, otherwise ``self._hn``) - ops, MACs
          and weights, as well as the real output layers' shapes.
        * ``self._layers_by_index`` - the ops, MACs and weights totals.

        Input shapes are taken from ``self._hn``. The batch dimension (first entry) is dropped from
        every stored shape.
        """
        self.total_weights = 0
        self.lcus = 0
        self.layers = 0
        self.total_ops_per_frame = 0.0
        self.total_macs_per_frame = 0.0
        self.total_4bit_macs_per_frame = 0.0
        self.pure_total_ops_per_frame = 0.0
        self.pure_total_macs_per_frame = 0.0
        self.pure_total_weights = 0.0
        # Start the total power calc with the static power
        static_power_at_25C = 150 if self._hw_arch.is_mercury_arch else 350
        self.total_power = static_power_at_25C
        self.subclusters = 0
        self.weights_blocks = 0
        self.output_blocks = 0
        self.latency = 0
        self.input_aligners = 0
        self.apus = 0
        self.l4_cuts = 0
        self.l2_weights = 0
        self.temporal_contexts = 0

        enhanced_layers_indices = set()
        for resources in self._graph.resources_list:
            layer_power = self._power(resources)
            # Only resources that draw power contribute to the power total and the LCU/layer counts
            if layer_power > 0:
                if resources.control_type == integrated_hw_graph_base_pb2.PROTO_LCU_UNIT:
                    self.lcus += 1
                    self.layers += 1
                self.total_power += layer_power
            self.subclusters += resources.subclusters
            self.weights_blocks += resources.weights_blocks
            self.output_blocks += resources.output_blocks
            self.latency = max(self.latency, resources.node_end_time / self._clk_freq)
            self.input_aligners += resources.input_aligners
            self.apus += resources.apus
            self.l4_cuts += resources.l4_cuts
            self.l2_weights += resources.l2_weights
            self.temporal_contexts += resources.temporal_contexts
            if resources.is_4bit_enhanced:
                enhanced_layers_indices.add(resources.layer_index)

        self.latency *= 1000  # Latency should be in milliseconds

        # Prefer the original network when it is available, otherwise fall back to the current one
        hailo_nn = self._hn
        if self._original_hailo_nn:
            hailo_nn = self._original_hailo_nn
        # Note: gather pure statistics for comparison (currently debug only)
        # NOTE(review): total_4bit_macs_per_frame is accumulated both in this loop and in the
        # _layers_by_index loop below, which looks like it may double count - confirm the intent.
        for hn_layer in hailo_nn.nodes:
            self.pure_total_ops_per_frame += hn_layer.ops
            self.pure_total_macs_per_frame += hn_layer.macs
            self.pure_total_weights += hn_layer.weights
            if hn_layer.index in enhanced_layers_indices:
                self.total_4bit_macs_per_frame += hn_layer.macs

        for hn_layer in self._layers_by_index.values():
            self.total_ops_per_frame += hn_layer.ops
            self.total_macs_per_frame += hn_layer.macs
            self.total_weights += hn_layer.weights
            if hn_layer.index in enhanced_layers_indices:
                self.total_4bit_macs_per_frame += hn_layer.macs

        input_layers = self._hn.get_non_const_input_layers()
        self.input_shapes = OrderedDict()
        for input_layer in input_layers:
            self.input_shapes[input_layer.name] = [int(x) for x in input_layer.input_shape[1:]]
        # Use the network selected above so a missing original network does not raise on None
        output_layers = hailo_nn.get_real_output_layers(remove_non_neural_core_layers=False)
        self.output_shapes = OrderedDict()
        for output_layer in output_layers:
            self.output_shapes[output_layer.name] = [int(x) for x in output_layer.output_shapes[0][1:]]

    def _get_fps(self, frame_total_cycles, node_clk_freq):
        """Return the frame rate for ``frame_total_cycles`` cycles per frame.

        ``node_clk_freq`` is used as the clock frequency; when it is falsy (unset or zero),
        ``self._clk_freq`` is used instead.
        """
        # Workaround for a new proto field in 3.25, to avoid a crash when it is not populated
        clk_freq = node_clk_freq or self._clk_freq
        return clk_freq / frame_total_cycles

    def _total_throughput(self, input_throughput=False):
        """Sum the per-context tensor throughput of the first network group in ``self._hef_proto``.

        Args:
            input_throughput: ``True`` to account for input tensors, ``False`` (default) for outputs.

        Returns:
            A ``(boundary_total, overall_total)`` tuple, where ``overall_total`` adds the DDR and
            inter-context totals to the boundary total.

        Side effects:
            Stores each context's boundary / inter-context / DDR values in ``self._context_bw[idx]``
            under the ``*_in`` or ``*_out`` keys, depending on ``input_throughput``.
        """
        network_group = self._hef_proto.network_groups[0]
        if len(network_group.ops) > 0:
            core_op = [op.core_op for op in network_group.ops if op.HasField("core_op")][0]
            if len(core_op.partial_core_ops) > 0:
                # OK to use 0 here, bandwidth is the same for all partial network groups
                core_op = core_op.partial_core_ops[0].core_op
            proto_contexts = core_op.contexts
        else:
            if len(network_group.partial_network_groups) > 0:
                # OK to use 0 here, bandwidth is the same for all partial network groups
                network_group = network_group.partial_network_groups[0].network_group
            proto_contexts = network_group.contexts

        if input_throughput:
            boundary_key, inter_context_key, ddr_key = "boundary_in", "inter_context_in", "ddr_portals_in"
        else:
            boundary_key, inter_context_key, ddr_key = "boundary_out", "inter_context_out", "ddr_portals_out"

        boundary_total = 0
        ddr_total = 0
        inter_context_total = 0
        for idx, proto_context in enumerate(proto_contexts):
            # Make sure the context has a bandwidth entry before it is measured
            self._context_bw.setdefault(idx, {})
            boundary, ddr, inter_context = self._calculate_context_throughput(
                idx,
                inputs_only=input_throughput,
                outputs_only=not input_throughput,
                proto=proto_context,
            )
            boundary_total += boundary
            ddr_total += ddr
            inter_context_total += inter_context

            context_bw = self._context_bw[idx]
            context_bw[boundary_key] = boundary
            context_bw[inter_context_key] = inter_context
            context_bw[ddr_key] = ddr

        return boundary_total, boundary_total + ddr_total + inter_context_total

    def _total_network(self):
        """Build the network-wide summary row of the report.

        Returns:
            dict: a row keyed by the names in ``METADATA_FIELDS.FIELDS_LIST`` (each initialised to
            ``""``) and filled with the model totals, the resources available on the target
            hardware, throughput figures and the model-script / optimization settings.
        """
        FIELDS = self.METADATA_FIELDS
        row = dict.fromkeys(FIELDS.FIELDS_LIST, "")

        consts = self._hw_arch.consts

        def cut_size(depth_key, width_key):
            # Product of the two memory constants divided by 8 (presumably bits -> bytes; confirm).
            return consts[depth_key] * consts[width_key] / 8

        def units_per_target(per_cluster_key):
            # Per-cluster amount multiplied by the number of actual clusters.
            return consts["CORE_PKG::ACTUAL_CLUSTERS"] * consts[per_cluster_key]

        def two_decimals(value):
            return f"{value:.2f}"

        def readable_throughput(value):
            return str(human_size_throughput(value))

        l3_cut_size = cut_size("CLUSTER_MEM::D_VIR_L3_MEMORY", "CLUSTER_MEM::W_VIR_L3_MEMORY")
        l2_cut_size = cut_size("SUBCLUSTER_RESOURCES::D_L2_MEMORY", "SUBCLUSTER_RESOURCES::W_L2_MEMORY")
        l4_cut_size = cut_size("PREPOST_CLUSTER::D_VIR_L4_MEMORY", "PREPOST_CLUSTER::W_VIR_L4_MEMORY")
        subclusters_per_target = units_per_target("CLUSTER_UNITS::SUBCLUSTERS")
        l3_cuts_per_target = units_per_target("CLUSTER_UNITS::MEMORY")
        l4_cuts_per_target = consts["PREPOST_CLUSTER::L4_MEMORY_CUTS"]

        l3_to_l2_ratio = old_div(l3_cut_size, l2_cut_size)

        # Identification and resources used by the model.
        row.update(
            {
                FIELDS.MODEL_NAME: self._hn.name,
                FIELDS.HW_ARCH: self._hw_arch.name,
                FIELDS.SUBCLUSTERS: self.subclusters,
                FIELDS.L3_WEIGHT_CUTS: self.weights_blocks,
                FIELDS.L3_OUTPUT_CUTS: self.output_blocks,
                FIELDS.L2_WEIGHTS: self.l2_weights,
            },
        )
        row[FIELDS.L3_TOTAL_CUTS] = row[FIELDS.L3_WEIGHT_CUTS] + row[FIELDS.L3_OUTPUT_CUTS]

        # Model totals and performance figures. FPS and latency are not reported for multi-context networks.
        row.update(
            {
                FIELDS.LCUS: self.lcus,
                FIELDS.LAYERS: self.layers,
                FIELDS.WEIGHTS: self.pure_total_weights,
                FIELDS.OPS_PER_IMAGE: self.total_ops_per_frame,
                FIELDS.MACS_PER_IMAGE: self.total_macs_per_frame,
                FIELDS.TOTAL_4BIT_MACS_PER_FRAME: self.total_4bit_macs_per_frame,
                FIELDS.PURE_OPS_PER_IMAGE: self.pure_total_ops_per_frame,
                FIELDS.PURE_MACS_PER_IMAGE: self.pure_total_macs_per_frame,
                FIELDS.POWER: self._get_total_power(),
                FIELDS.FPS: (
                    Estimator.FIELD_NOT_AVAILABLE if self._is_multi_context else two_decimals(self._min_fps)
                ),
                FIELDS.STREAM_FPS: two_decimals(self._stream_fps) if self._stream_fps else None,
                FIELDS.OPTIMIZATION_GOAL_FPS: (
                    two_decimals(self._optimization_goal_fps) if self._optimization_goal_fps else None
                ),
                FIELDS.CONTEXT_SWITCH_CONFIGS: two_decimals(self._total_configs) if self._total_configs else None,
                FIELDS.LATENCY: (
                    Estimator.FIELD_NOT_AVAILABLE if self._is_multi_context else two_decimals(self.latency)
                ),
            },
        )

        # Resources available on the target, and the L2 figures normalized by the L3/L2 cut size ratio.
        row.update(
            {
                FIELDS.SUBCLUSTERS_PER_TARGET: subclusters_per_target,
                FIELDS.L3_CUTS_PER_TARGET: l3_cuts_per_target,
                FIELDS.L2_CUTS_PER_TARGET_NORMALIZED: old_div(subclusters_per_target, l3_to_l2_ratio),
                FIELDS.L2_CUTS_USED_NORMALIZED: old_div(row[FIELDS.SUBCLUSTERS], l3_to_l2_ratio),
                FIELDS.LAYERS_PER_TARGET: units_per_target("CLUSTER_LAYERS::LAYERS"),
                # The "Compiled" profiling mode is reported under the name "Post Placement".
                FIELDS.PROFILING_MODE: (
                    "Post Placement" if self._profiling_mode == "Compiled" else self._profiling_mode
                ),
            },
        )

        # Throughput figures, rendered as human readable strings.
        row.update(
            {
                FIELDS.NET_INPUT_THROUGHPUT: readable_throughput(self.net_input_throughput),
                FIELDS.GROSS_INPUT_THROUGHPUT: readable_throughput(self.gross_input_throughput),
                FIELDS.NET_OUTPUT_THROUGHPUT: readable_throughput(self.net_output_throughput),
                FIELDS.GROSS_OUTPUT_THROUGHPUT: readable_throughput(self.gross_output_throughput),
            },
        )

        # Remaining hardware units: used amount next to the amount available on the target.
        row.update(
            {
                FIELDS.INPUT_ALIGNERS: self.input_aligners,
                FIELDS.INPUT_ALIGNERS_PER_TARGET: units_per_target("CLUSTER_UNITS::INPUT_ALIGNERS"),
                FIELDS.APUS: self.apus,
                FIELDS.APUS_PER_TARGET: units_per_target("CLUSTER_UNITS::APUS"),
                FIELDS.L4_CUTS: self.l4_cuts,
                FIELDS.L4_CUTS_PER_TARGET: l4_cuts_per_target,
                # Written a second time here, exactly as the row has always been filled.
                FIELDS.L2_WEIGHTS: self.l2_weights,
                FIELDS.L2_DATA_USAGE: self.temporal_contexts,
                FIELDS.L2_CUTS_PER_TARGET: subclusters_per_target,
                FIELDS.NUMBER_OF_DEVICES: self._number_of_devices(),
                FIELDS.NUMBER_OF_CONTEXTS: self._number_of_contexts,
            },
        )

        # Model script modifications and optimization settings.
        row.update(
            {
                FIELDS.TRANSPOSE: self._get_transpose(),
                FIELDS.RESIZE_INPUT: len(self._get_resize()) > 0,
                FIELDS.NORMALIZATION: self._get_normalization(),
                FIELDS.INPUT_CONVERSION: len(self._get_input_conversion_type()) > 0,
                FIELDS.NMS: any("NMS" in entry for entry in self._get_post_processing()),
                FIELDS.OPTIMIZATION_LEVEL: self._get_optimization_level(),
                FIELDS.COMPRESSION_LEVEL: self._get_compression_level(),
                FIELDS.COMPRESSION_RATE: self._get_compression_rate(),
                FIELDS.CALIBRATION: self._get_calibration_size(),
                FIELDS.L2_CUT_SIZE: l2_cut_size,
                FIELDS.L3_CUT_SIZE: l3_cut_size,
                FIELDS.L4_CUT_SIZE: l4_cut_size,
            },
        )

        return row

    def _get_total_power(self):
        if self._hw_arch.does_support_power_profiling and not self._is_multi_context:
            return f"{self.total_power:.2f}"

    def _get_normalization(self):
        """Tell whether a normalization command is among the model modification commands."""
        normalization_command = SupportedCommands.NORMALIZATION.value
        return normalization_command in self._modification_commands

    def _get_resize(self):
        """Collect the input shapes of the resize modification command.

        Returns:
            list: for each resize command found, its ``input_shapes`` joined with ``"x"``;
            an empty list when there is no resize command.
        """
        commands = self._modification_commands
        resize_command = SupportedCommands.RESIZE.value
        return [
            "x".join(str(dim) for dim in commands[name].input_shapes)
            for name in commands
            if name == resize_command
        ]

    def _get_transpose(self):
        """Tell whether a transpose command is among the model modification commands."""
        transpose_command = SupportedCommands.TRANSPOSE.value
        return transpose_command in self._modification_commands

    def _get_input_conversion_type(self):
        """Collect the conversion types of the input conversion modification command.

        Returns:
            list: the ``conversion_type.value`` of each input conversion command found;
            an empty list when there is no such command.
        """
        commands = self._modification_commands
        conversion_command = SupportedCommands.INPUT_CONVERSION.value
        return [commands[name].conversion_type.value for name in commands if name == conversion_command]

    def _get_post_processing(self):
        """Describe the post-processing related modification commands.

        Returns:
            list: one string per matching command, in the iteration order of the commands:
            ``"<meta arch> NMS"`` for NMS post-process, the activation type for a changed output
            activation, and the activation type value for a logits layer.
        """
        commands = self._modification_commands
        nms_command = SupportedCommands.NMS_POSTPROCESS.value
        output_activation_command = SupportedCommands.CHANGE_OUTPUT_ACTIVATION.value
        logits_command = SupportedCommands.LOGITS_LAYER.value

        descriptions = []
        for name in commands:
            if name == nms_command:
                descriptions.append(commands[name].meta_arch.value + " NMS")
                continue
            if name == output_activation_command:
                # Here the activation type itself is stringified, not its ``.value``.
                descriptions.append(str(commands[name].activation_type))
                continue
            if name == logits_command:
                descriptions.append(str(commands[name].activation_type.value))

        return descriptions

    def _get_compression_level(self):
        return self._mo_flavor.compression_level if self._mo_flavor else Estimator.FIELD_NOT_AVAILABLE

    def _get_optimization_level(self):
        return self._mo_flavor.optimization_level if self._mo_flavor else Estimator.FIELD_NOT_AVAILABLE

    def _get_calibration_size(self):
        if "calibration" in self._optimization_commands:
            if "calibset_size" in self._optimization_commands["calibration"]:
                return self._optimization_commands["calibration"]["calibset_size"]

        return CalibrationConfig.get_default().calibset_size

    def _get_compression_rate(self):
        """Return the ``auto_4bit_weights_ratio`` of the flavor's compression params.

        The 'not available' marker is returned when the flavor config has no compression params.
        """
        compression_key = ModelOptimizationCommand.compression_params.value
        if compression_key not in self._flavor_config:
            return Estimator.FIELD_NOT_AVAILABLE

        return self._flavor_config[compression_key]["auto_4bit_weights_ratio"]

    def _lcu_key(self, lcu_action):
        FIELDS = self.RUNTIME_FIELDS
        return str((lcu_action[FIELDS.DATA][FIELDS.CLUSTER_INDEX], lcu_action[FIELDS.DATA]["lcu_index"]))

    def _collect_bursts_sizes(self, proto_operations):
        burst_sizes = {}
        for operation in proto_operations:
            for action in operation.actions:
                if action.HasField("write_data_ccw_ptr"):
                    ccw_action = action.write_data_ccw_ptr
                    if not ccw_action.size:
                        continue
                    data_size = ccw_action.size
                elif action.HasField("write_data_ccw"):
                    ccw_action = action.write_data_ccw
                    if not ccw_action.data:
                        continue
                    data_size = len(action.write_data_ccw.data)
                else:
                    continue

                channel_idx = ccw_action.cfg_channel_index
                if channel_idx not in burst_sizes:
                    burst_sizes[channel_idx] = []

                burst_sizes[channel_idx].append(data_size)
        return burst_sizes

    def _calculate_context_throughput(self, context_idx, inputs_only=False, outputs_only=False, proto=None):
        relevant_context = self._profiler_data.context[context_idx]
        if not relevant_context.HasField("bandwidth"):
            raise ProfilingException(
                "Profiler does not support compiled files that were created using Dataflow Compiler <= 3.25.",
            )
        bandwidth = relevant_context.bandwidth
        if inputs_only:
            return bandwidth.input_boundary, bandwidth.input_ddr, bandwidth.input_inter_context
        elif outputs_only:
            return bandwidth.output_boundary, bandwidth.output_ddr, bandwidth.output_inter_context
        else:
            return (
                bandwidth.input_boundary + bandwidth.output_boundary,
                bandwidth.input_ddr + bandwidth.output_ddr,
                bandwidth.input_inter_context + bandwidth.output_inter_context,
            )

    def _get_partial_network_group_id(self, raw_network_group):
        # TODO SDK-46602: Get it from runtime json
        return 0

    def _copy_checkpoints(self, raw_network_group, proto_network_group):
        is_mercury = self._hw_arch.is_mercury_arch
        DESCRIPTOR_SIZE = 512 if not is_mercury else 1  # Bytes
        FIELDS = self.RUNTIME_FIELDS
        network_group = {FIELDS.CONTEXTS: []}

        # in case its -1 on runtime json which is the default for legacy hailortcli run (not run2)
        batch_size = max(1, raw_network_group.get(FIELDS.BATCH_SIZE, 1))
        network_group[FIELDS.BATCH_SIZE] = batch_size

        if len(proto_network_group.ops) == 0:
            if len(proto_network_group.partial_network_groups) > 0:
                partial_network_group_id = self._get_partial_network_group_id(raw_network_group)
                proto_network_group = proto_network_group.partial_network_groups[partial_network_group_id].network_group
            proto_contexts = proto_network_group.contexts
            preliminary_config = proto_network_group.preliminary_config
        else:
            core_ops = [op.core_op for op in proto_network_group.ops if op.HasField("core_op")]
            core_op = core_ops[0]
            if len(core_op.partial_core_ops) > 0:
                partial_network_group_id = self._get_partial_network_group_id(raw_network_group)
                core_op = core_op.partial_core_ops[partial_network_group_id].core_op
            proto_contexts = core_op.contexts
            preliminary_config = core_op.preliminary_config

        gross_input_throughput = 0
        gross_output_throughput = 0
        gross_configs = 0
        for context_index, raw_context in enumerate(raw_network_group[FIELDS.CONTEXTS]):
            lcus = {}
            sequencer = {}
            configurations = {}
            overhead = {}
            lcu_start = 0
            total_configs = 0
            context_time = {}
            min_time = None
            max_time = None
            actions = raw_context[FIELDS.ACTIONS]
            if context_index > 0:
                proto_operations = proto_contexts[context_index - 1].operations
            else:
                proto_operations = preliminary_config.operation
            bursts_sizes = self._collect_bursts_sizes(proto_operations)
            burst_sizes_indices = {}
            module_config_done = False
            for raw_action in actions:
                if raw_action[FIELDS.TYPE] == FIELDS.LCU_DONE:
                    lcus[self._lcu_key(raw_action)] = {}
                    lcus[self._lcu_key(raw_action)][FIELDS.START] = lcu_start
                    lcus[self._lcu_key(raw_action)][FIELDS.END] = raw_action[FIELDS.TIMESTAMP]
                if raw_action[FIELDS.TYPE] == FIELDS.FETCH_DESCRIPTORS:
                    if is_mercury:
                        burst_count = raw_action[FIELDS.DATA][FIELDS.CCW_BURSTS]
                        config_channel = raw_action[FIELDS.DATA][FIELDS.CONFIG_STREAM_INDEX]
                        if config_channel not in burst_sizes_indices:
                            burst_sizes_indices[config_channel] = 0
                        current_burst_index = burst_sizes_indices[config_channel]
                        assert (
                            len(bursts_sizes) > 0
                            and len(bursts_sizes[config_channel]) >= current_burst_index + burst_count
                        ), "CCW configurations from hef proto doesn't match runtime data."
                        total_configs += sum(
                            bursts_sizes[config_channel][current_burst_index : current_burst_index + burst_count],
                        )
                        burst_sizes_indices[config_channel] += burst_count
                    else:
                        total_configs += raw_action[FIELDS.DATA][FIELDS.DESCRIPTORS_COUNT]
                    if len(configurations) == 0:
                        configurations[FIELDS.START] = raw_action[FIELDS.TIMESTAMP]
                if raw_action[FIELDS.TYPE] == FIELDS.MODULE_CONFIG_DONE and raw_action[FIELDS.DATA][
                    FIELDS.MODULE_INDEX
                ] in [13, 14]:
                    module_config_done = True
                    configurations[FIELDS.END] = raw_action[FIELDS.TIMESTAMP]
                    overhead[FIELDS.START] = raw_action[FIELDS.TIMESTAMP]
                if raw_action[FIELDS.TYPE] == FIELDS.SEQUENCER_START:
                    sequencer[raw_action[FIELDS.DATA][FIELDS.CLUSTER_INDEX]] = {
                        FIELDS.START: raw_action[FIELDS.TIMESTAMP],
                        FIELDS.END: None,
                    }
                if raw_action[FIELDS.TYPE] == FIELDS.SEQUENCER_DONE and not module_config_done:
                    if context_index == 0:
                        # Currently we don't display sequencers for preliminary context, so it doesnt really matter what we put here
                        # Ideally, we should put here the time on which we fetch the configs that will lead to sequencer start.
                        sequencer[raw_action[FIELDS.DATA][FIELDS.SEQUENCER_INDEX]] = {
                            FIELDS.START: 0,
                            FIELDS.END: None,
                        }
                    sequencer[raw_action[FIELDS.DATA][FIELDS.SEQUENCER_INDEX]][FIELDS.END] = raw_action[
                        FIELDS.TIMESTAMP
                    ]
                if raw_action[FIELDS.TYPE] in FIELDS.ACTIVATE_INPUT:
                    if len(network_group[FIELDS.CONTEXTS]) > 0:
                        affected_overhead = network_group[FIELDS.CONTEXTS][-1][FIELDS.OVERHEAD]
                    else:
                        affected_overhead = overhead
                    affected_overhead[FIELDS.END] = raw_action[FIELDS.TIMESTAMP]
                    lcu_start = raw_action[FIELDS.TIMESTAMP]
                min_time = (
                    raw_action[FIELDS.TIMESTAMP]
                    if min_time is None or raw_action[FIELDS.TIMESTAMP] < min_time
                    else min_time
                )
                max_time = (
                    raw_action[FIELDS.TIMESTAMP]
                    if max_time is None or raw_action[FIELDS.TIMESTAMP] > max_time
                    else max_time
                )

            context_time[FIELDS.START] = min_time
            context_time[FIELDS.END] = max_time

            # calc FPS
            latency = (actions[-1][FIELDS.TIMESTAMP] - actions[0][FIELDS.TIMESTAMP]) / self._get_nnm_clk_freq()
            fps_by_latency = 1 / latency if latency else 0
            if len(raw_network_group[FIELDS.CONTEXTS]) > 2:  # multi-context
                fps = fps_by_latency
            else:
                fps = self._min_fps
            total_ddr = 0
            total_inter_context = 0
            total_boundaries = 0
            if context_index > 0:
                proto_context = proto_contexts[context_index - 1]
                total_boundaries, total_ddr, total_inter_context = self._calculate_context_throughput(
                    context_index - 1,
                    proto=proto_context,
                )

                boundary_tensors, ddr_tensors, inter_context_tensors = self._calculate_context_throughput(
                    context_index - 1,
                    inputs_only=True,
                    proto=proto_context,
                )
                gross_input_throughput += boundary_tensors + ddr_tensors + inter_context_tensors

                boundary_tensors, ddr_tensors, inter_context_tensors = self._calculate_context_throughput(
                    context_index - 1,
                    outputs_only=True,
                    proto=proto_context,
                )
                gross_output_throughput += boundary_tensors + ddr_tensors + inter_context_tensors

                total_configs *= DESCRIPTOR_SIZE
                gross_configs += total_configs
            # to Gb/s
            total_configs *= 2 ** (-30) * 8 * fps_by_latency
            total_inter_context *= 2 ** (-30) * 8 * fps
            total_boundaries *= 2 ** (-30) * 8 * fps
            total_ddr *= 2 ** (-30) * 8 * fps
            bandwidth = {
                FIELDS.CONFIGS_BW: total_configs,
                FIELDS.DDR_BW: total_ddr,
                FIELDS.INTER_CONTEXT_BW: total_inter_context,
                FIELDS.BOUNDARIES_BW: total_boundaries,
            }
            context = {
                FIELDS.LAYERS: lcus,
                FIELDS.CONFIGURATIONS: configurations,
                FIELDS.SEQUENCER: sequencer,
                FIELDS.OVERHEAD: overhead,
                FIELDS.BW: bandwidth,
                FIELDS.CONTEXT_TIME: context_time,
            }
            # Validation
            for field in [FIELDS.CONFIGURATIONS, FIELDS.LAYERS, FIELDS.SEQUENCER, FIELDS.CONTEXT_TIME]:
                if FIELDS.START in context[field]:
                    if context[field][FIELDS.END] < context[field][FIELDS.START]:
                        raise ProfilingException(
                            "\nRuntime data file has some incorrect data. "
                            + "Make sure to use the HailoRT version that corresponds to this Dataflow Compiler version. "
                            + f"First error is on batch {batch_size}, context {context_index}, {field}: end={context[field][FIELDS.END]} comes before start={context[field][FIELDS.START]}",
                        )
                else:
                    for value in context[field]:
                        if context[field][value][FIELDS.END] < context[field][value][FIELDS.START]:
                            raise ProfilingException(
                                "\nRuntime data file has some incorrect data. "
                                + "Make sure to use the HailoRT version that corresponds to this Dataflow Compiler version. "
                                + f"First error is on batch {batch_size}, context {context_index}, {field}, {value}: end={context[field][value][FIELDS.END]} comes before start={context[field][value][FIELDS.START]}",
                            )
            network_group[FIELDS.CONTEXTS].append(context)

        # Take FPS from runtime data if it exists
        # Actually, even if FPS exists on runtime data, it could be -1. So for this case, or for when the fps doesn't exists, we'll get -1
        fps_from_runtime = raw_network_group.get(self.PERFORMANCE_DETAILS_FIELDS.FPS, -1)
        if not self._total_configs:
            self._total_configs = gross_configs

        # Fix first overhead
        if len(raw_network_group[FIELDS.CONTEXTS]) > 2:  # multi-context
            time_first_context_begins = network_group[FIELDS.CONTEXTS][1][FIELDS.CONTEXT_TIME][FIELDS.START]
            time_first_context_gets_input = network_group[FIELDS.CONTEXTS][0][FIELDS.OVERHEAD][FIELDS.END]
            time_last_context_overhead_begins = network_group[FIELDS.CONTEXTS][-1][FIELDS.OVERHEAD][FIELDS.START]
            time_last_context_ends = network_group[FIELDS.CONTEXTS][-1][FIELDS.CONTEXT_TIME][FIELDS.END]
            network_group[FIELDS.CONTEXTS][-1][FIELDS.OVERHEAD][FIELDS.END] = (
                network_group[FIELDS.CONTEXTS][-1][FIELDS.OVERHEAD][FIELDS.START]
                + (time_last_context_ends - time_last_context_overhead_begins)
                + (time_first_context_gets_input - time_first_context_begins)
            )

            # Final Latency, FPS
            if fps_from_runtime != -1:
                total_fps = fps_from_runtime
                total_latency = batch_size / fps_from_runtime
            else:
                if time_last_context_ends == 0 and time_first_context_begins == 0:
                    self._logger.warning("Runtime data is partial.. failed calculation of runtime latency and FPS.")
                    total_latency = self.FIELD_NOT_AVAILABLE
                    total_fps = self.FIELD_NOT_AVAILABLE
                else:
                    total_latency = (time_last_context_ends - time_first_context_begins) / self._get_nnm_clk_freq()
                    total_fps = batch_size / total_latency
            total_power = self.FIELD_NOT_AVAILABLE
        else:
            total_latency = self.latency
            total_fps = fps_from_runtime if fps_from_runtime != -1 else self._min_fps
            total_power = self._get_total_power() or self.FIELD_NOT_AVAILABLE
        if total_fps != self.FIELD_NOT_AVAILABLE:
            input_bw = gross_input_throughput * total_fps
            output_bw = gross_output_throughput * total_fps
            ops_per_second = self.total_ops_per_frame * total_fps
        else:
            input_bw = self.FIELD_NOT_AVAILABLE
            output_bw = self.FIELD_NOT_AVAILABLE
            ops_per_second = self.FIELD_NOT_AVAILABLE
        PERFORMANCE_FIELDS = self.PERFORMANCE_DETAILS_FIELDS
        performance_details = {
            PERFORMANCE_FIELDS.BATCH_SIZE: batch_size,
            PERFORMANCE_FIELDS.FPS: total_fps,
            PERFORMANCE_FIELDS.LATENCY: total_latency * 1000,  # specifically this fields is in ms
            PERFORMANCE_FIELDS.POWER: total_power,
            PERFORMANCE_FIELDS.NUMBER_OF_CONTEXTS: self._number_of_contexts,
            PERFORMANCE_FIELDS.INPUT_BW: input_bw,
            PERFORMANCE_FIELDS.OUTPUT_BW: output_bw,
            PERFORMANCE_FIELDS.OPS_PER_SECOND: ops_per_second,
            PERFORMANCE_FIELDS.MEASURED_MAC_UTIL: self.FIELD_NOT_AVAILABLE,
            PERFORMANCE_FIELDS.STREAM_FPS: self.FIELD_NOT_AVAILABLE,
        }

        return network_group, performance_details

    def _move_first_context_configs(self, network_group, network_group_orig):
        """Rearrange the processed contexts of ``network_group`` in place for display.

        The first entry of the contexts list is moved to ``FIELDS.PRELIMINARY``; the remaining
        contexts keep their layers but take configurations, sequencer and overhead data from
        ``network_group_orig`` (shifted by one context), and a trailing 'phantom' context is appended.
        Finally the runtime percentages are computed and stored on ``network_group``.

        Args:
            network_group: processed network group; modified in place.
            network_group_orig: source of the configurations / sequencer / overhead / bandwidth data.
                Presumably an untouched copy of ``network_group`` taken by the caller - TODO confirm.
                Its dicts are shared (not copied) with ``network_group``, so the in-place time shifts
                below modify ``network_group_orig`` as well.

        Returns:
            ``network_group`` for a single context network; ``None`` otherwise.
            NOTE(review): the two paths return different things, and the runtime percentages are only
            set on the multi-context path - confirm that callers rely only on the in-place mutation.
        """
        # The Runtime Profiler displays 'contexts' as: first configurations, then sequencer, overhead, and lastly the
        # layers.
        # Since the hailort json file groups the next context's configs, sequencer data with the previous infer, we are
        # doing the following data movements:
        # - Remove the preliminary context
        # - For the first context: leave the layers (infer) as they are, but use the last context's configs, seq,
        # overhead, bw data.
        # - For the second to last contexts: leave the layers as they are, but copy the previous context's configs, seq,
        # overhead, bw data.
        # - Create a 'phantom' context to display at the end, to show that after the last context the first context
        # returns

        FIELDS = self.RUNTIME_FIELDS

        # Remove preliminary
        network_group[FIELDS.PRELIMINARY] = network_group[FIELDS.CONTEXTS][0]
        network_group[FIELDS.PRELIMINARY][FIELDS.LABEL] = "preliminary"
        # remove overhead as we don't have information of the first frame
        network_group[FIELDS.PRELIMINARY][FIELDS.OVERHEAD] = {}
        network_group[FIELDS.CONTEXTS] = network_group[FIELDS.CONTEXTS][1:]
        # Index of the last context in the original (still including the preliminary) list
        last_index = len(network_group_orig[FIELDS.CONTEXTS]) - 1
        network_group[FIELDS.CONTEXTS][0][FIELDS.LABEL] = "context_0"

        if len(network_group[FIELDS.CONTEXTS]) == 1:
            # Single context: only the label and the bandwidth are kept
            network_group[FIELDS.CONTEXTS][0] = {
                FIELDS.LABEL: network_group[FIELDS.CONTEXTS][0][FIELDS.LABEL],
                FIELDS.BW: network_group[FIELDS.CONTEXTS][0][FIELDS.BW],
            }
            return network_group

        # The first displayed context takes configs / sequencer / overhead of the last original context.
        # These are the very same dict objects as in network_group_orig (no copy is made).
        last_index_context = network_group_orig[FIELDS.CONTEXTS][last_index]
        network_group[FIELDS.CONTEXTS][0][FIELDS.CONFIGURATIONS] = last_index_context[FIELDS.CONFIGURATIONS]
        network_group[FIELDS.CONTEXTS][0][FIELDS.SEQUENCER] = last_index_context[FIELDS.SEQUENCER]
        network_group[FIELDS.CONTEXTS][0][FIELDS.OVERHEAD] = last_index_context[FIELDS.OVERHEAD]
        # We currently define 'context' in a bad way: [prev_configs, prev_seq, prev_overhead, prev_bandwidth,
        #                                              curr_layers].
        # This representation was chosen because actually the configs fetched during the previous context belong to the
        # current one.
        # But.. the analogy breaks when talking about bandwidth, which is the sum of ddr portals, boundary in/outs and
        # inter context in/outs.
        # When context_0 is actually running on device, it uses the corresponding pcie-bw.
        # Therefore, we'll keep the indices of the bandwidth values.
        network_group[FIELDS.CONTEXTS][0][FIELDS.BW] = network_group_orig[FIELDS.CONTEXTS][1][FIELDS.BW]

        # Create the new phantom context.
        # Deep copies are taken here, before the time shifts below, so the phantom keeps the unshifted values.
        phantom = {
            FIELDS.LABEL: "context_0",
            FIELDS.CONFIGURATIONS: copy.deepcopy(
                network_group_orig[FIELDS.CONTEXTS][last_index][FIELDS.CONFIGURATIONS],
            ),
            FIELDS.SEQUENCER: copy.deepcopy(network_group_orig[FIELDS.CONTEXTS][last_index][FIELDS.SEQUENCER]),
            FIELDS.OVERHEAD: copy.deepcopy(network_group_orig[FIELDS.CONTEXTS][last_index][FIELDS.OVERHEAD]),
            FIELDS.CONFIGS_BW: 0,
            FIELDS.LAYERS: {},
        }

        # network_group lost its preliminary entry above, so index i here lines up with index i + 1 of
        # network_group_orig (assuming both started from the same contexts list - TODO confirm).
        for context_index in range(1, len(network_group[FIELDS.CONTEXTS])):
            orig_curr_context = network_group_orig[FIELDS.CONTEXTS][context_index]
            orig_next_context = network_group_orig[FIELDS.CONTEXTS][context_index + 1]
            network_group[FIELDS.CONTEXTS][context_index][FIELDS.LABEL] = "context_" + str(context_index)
            # Take configs / sequencer / overhead from the original entry at the same index, i.e. the
            # entry that precedes this context in the original list
            network_group[FIELDS.CONTEXTS][context_index][FIELDS.CONFIGURATIONS] = orig_curr_context[
                FIELDS.CONFIGURATIONS
            ]
            network_group[FIELDS.CONTEXTS][context_index][FIELDS.SEQUENCER] = orig_curr_context[FIELDS.SEQUENCER]
            network_group[FIELDS.CONTEXTS][context_index][FIELDS.OVERHEAD] = orig_curr_context[FIELDS.OVERHEAD]
            # On bandwidth, we'll use 1:1 copy
            network_group[FIELDS.CONTEXTS][context_index][FIELDS.BW] = orig_next_context[FIELDS.BW]

        # Cut first context configurations to start from actual start
        # (the end time of the last layer entry of the last context)
        actual_start = list(network_group[FIELDS.CONTEXTS][-1][FIELDS.LAYERS].values())[-1][FIELDS.END]
        network_group[FIELDS.CONTEXTS][0][FIELDS.CONFIGURATIONS][FIELDS.START] = actual_start

        # Calculate the shift time
        start_time = next(iter(network_group[FIELDS.CONTEXTS][0][FIELDS.LAYERS].values()))[FIELDS.START]
        max_time = max(
            network_group[FIELDS.CONTEXTS][0][FIELDS.CONFIGURATIONS][FIELDS.END],
            list(network_group[FIELDS.CONTEXTS][0][FIELDS.SEQUENCER].values())[-1][FIELDS.END],
            list(network_group[FIELDS.CONTEXTS][0][FIELDS.LAYERS].values())[-1][FIELDS.END],
        )

        network_group[FIELDS.CONTEXTS].append(phantom)

        # Shift first context configurations to end exactly at start time
        # (from here on max_time holds the shift amount, not a timestamp)
        max_time -= start_time
        network_group[FIELDS.CONTEXTS][0][FIELDS.CONFIGURATIONS][FIELDS.START] -= max_time
        network_group[FIELDS.CONTEXTS][0][FIELDS.CONFIGURATIONS][FIELDS.END] -= max_time
        network_group[FIELDS.CONTEXTS][0][FIELDS.OVERHEAD][FIELDS.START] -= max_time
        network_group[FIELDS.CONTEXTS][0][FIELDS.OVERHEAD][FIELDS.END] -= max_time
        for seq in network_group[FIELDS.CONTEXTS][0][FIELDS.SEQUENCER].values():
            seq[FIELDS.START] -= max_time
            seq[FIELDS.END] -= max_time

        network_group[FIELDS.RUNTIME_PERCENTAGES] = self._get_runtime_percentages(network_group)

    def _get_runtime_percentages(self, network_group):
        FIELDS = self.RUNTIME_FIELDS
        # Compute values to show on HTML profiler
        total_infer = 0
        total_drain = 0
        total_sequencers = 0
        total_remain_configs = 0
        total_remain_overhead = 0
        total_bw_configs = 0
        total_bw_ddr = 0
        total_bw_inter = 0
        total_bw_boundary = 0
        # For all contexts, except the last 'phantom' context
        for context_index in range(len(network_group[FIELDS.CONTEXTS]) - 1):
            cur_context = network_group[FIELDS.CONTEXTS][context_index]

            if cur_context[self.RUNTIME_FIELDS.LAYERS]:
                cur_context_infer = min(
                    val[self.RUNTIME_FIELDS.END] for val in cur_context[self.RUNTIME_FIELDS.LAYERS].values()
                ) - min(val[self.RUNTIME_FIELDS.START] for val in cur_context[self.RUNTIME_FIELDS.LAYERS].values())

                cur_context_drain = max(
                    val[self.RUNTIME_FIELDS.END] for val in cur_context[self.RUNTIME_FIELDS.LAYERS].values()
                ) - min(val[self.RUNTIME_FIELDS.END] for val in cur_context[self.RUNTIME_FIELDS.LAYERS].values())
            else:
                # Only PP layers in context
                cur_context_infer = 0
                cur_context_drain = 0

            cur_context_seq = max(
                val[self.RUNTIME_FIELDS.END] for val in cur_context[self.RUNTIME_FIELDS.SEQUENCER].values()
            ) - min(val[self.RUNTIME_FIELDS.START] for val in cur_context[self.RUNTIME_FIELDS.SEQUENCER].values())

            cur_context_remaining_configs = cur_context[self.RUNTIME_FIELDS.CONFIGURATIONS][
                self.RUNTIME_FIELDS.END
            ] - max(val[self.RUNTIME_FIELDS.END] for val in cur_context[self.RUNTIME_FIELDS.SEQUENCER].values())

            cur_context_remaining_overhead = (
                cur_context[self.RUNTIME_FIELDS.OVERHEAD][self.RUNTIME_FIELDS.END]
                - cur_context[self.RUNTIME_FIELDS.OVERHEAD][self.RUNTIME_FIELDS.START]
            )

            total_bw_configs += cur_context[self.RUNTIME_FIELDS.BW][self.RUNTIME_FIELDS.CONFIGS_BW]
            total_bw_ddr += cur_context[self.RUNTIME_FIELDS.BW][self.RUNTIME_FIELDS.DDR_BW]
            total_bw_inter += cur_context[self.RUNTIME_FIELDS.BW][self.RUNTIME_FIELDS.INTER_CONTEXT_BW]
            total_bw_boundary += cur_context[self.RUNTIME_FIELDS.BW][self.RUNTIME_FIELDS.BOUNDARIES_BW]

            total_infer += cur_context_infer
            total_drain += cur_context_drain
            total_sequencers += cur_context_seq
            total_remain_configs += cur_context_remaining_configs
            total_remain_overhead += cur_context_remaining_overhead

        time_sum = total_infer + total_drain + total_sequencers + total_remain_configs + total_remain_overhead
        if time_sum == 0:
            time_sum = self.FIELD_NOT_AVAILABLE
            total_infer = self.FIELD_NOT_AVAILABLE
            total_drain = self.FIELD_NOT_AVAILABLE
            total_sequencers = self.FIELD_NOT_AVAILABLE
            total_remain_configs = self.FIELD_NOT_AVAILABLE
            total_remain_overhead = self.FIELD_NOT_AVAILABLE
        else:
            total_infer /= time_sum
            total_drain /= time_sum
            total_sequencers /= time_sum
            total_remain_configs /= time_sum
            total_remain_overhead /= time_sum

        bw_sum = total_bw_configs + total_bw_ddr + total_bw_inter + total_bw_boundary
        if bw_sum == 0:
            total_bw_configs = self.FIELD_NOT_AVAILABLE
            total_bw_ddr = self.FIELD_NOT_AVAILABLE
            total_bw_inter = self.FIELD_NOT_AVAILABLE
            total_bw_boundary = self.FIELD_NOT_AVAILABLE
        else:
            total_bw_configs /= bw_sum
            total_bw_ddr /= bw_sum
            total_bw_inter /= bw_sum
            total_bw_boundary /= bw_sum

        return {
            "infer": total_infer * 100,
            "drain": total_drain * 100,
            "sequencers": total_sequencers * 100,
            "configs": total_remain_configs * 100,
            "overhead": total_remain_overhead * 100,
            "bw_configs": total_bw_configs * 100,
            "bw_ddr": total_bw_ddr * 100,
            "bw_inter": total_bw_inter * 100,
            "bw_boundary": total_bw_boundary * 100,
        }

    def _normalize_times(self, network_group):
        FIELDS = self.RUNTIME_FIELDS
        # Shift all to start at zero
        min_time = next(iter(network_group[FIELDS.CONTEXTS][0][FIELDS.LAYERS].values()))[FIELDS.START]
        for context in network_group[FIELDS.CONTEXTS]:
            context[FIELDS.CONFIGURATIONS][FIELDS.START] -= min_time
            context[FIELDS.CONFIGURATIONS][FIELDS.END] -= min_time
            context[FIELDS.OVERHEAD][FIELDS.START] -= min_time
            if FIELDS.END in context[FIELDS.OVERHEAD]:
                context[FIELDS.OVERHEAD][FIELDS.END] -= min_time
            for seq in context[FIELDS.SEQUENCER].values():
                seq[FIELDS.START] -= min_time
                seq[FIELDS.END] -= min_time
            for layer in context[FIELDS.LAYERS].values():
                layer[FIELDS.START] -= min_time
                layer[FIELDS.END] -= min_time

    def _process_runtime_data(self):
        if self._runtime_data is None:
            return {}

        FIELDS = self.RUNTIME_FIELDS

        with open(self._runtime_data) as runtime_data_file:
            raw_runtime_data = json.load(runtime_data_file)

        version = raw_runtime_data["version"]
        if version == "1.0":
            all_batch_data = raw_runtime_data[FIELDS.NETWORK_GROUPS]
        elif version == "2.0":
            if len(raw_runtime_data[FIELDS.RUNS]) == 0:
                all_batch_data = raw_runtime_data[FIELDS.NETWORK_GROUPS][0]
            else:
                all_batch_data = raw_runtime_data[FIELDS.RUNS]
        else:
            raise ProfilingException(f"Unknown runtime data version {version}")

        self._nnm_clk_freq = raw_runtime_data["clock_cycle_MHz"]
        self._processed_runtime_data[FIELDS.RUNS] = []

        for raw_network_group in all_batch_data:
            if FIELDS.CONTEXT_NAME in raw_network_group[FIELDS.CONTEXTS][0]:
                preliminary_idx = None
                last_dynamic_idx = None
                for context_id in range(len(raw_network_group[FIELDS.CONTEXTS])):
                    if raw_network_group[FIELDS.CONTEXTS][context_id][FIELDS.CONTEXT_NAME] == "preliminary":
                        preliminary_idx = context_id
                    if "dynamic" in raw_network_group[FIELDS.CONTEXTS][context_id][FIELDS.CONTEXT_NAME]:
                        last_dynamic_idx = context_id

                assert (
                    preliminary_idx is not None and last_dynamic_idx is not None and last_dynamic_idx > preliminary_idx
                ), "Could not process runtime data json file"
                if last_dynamic_idx < len(raw_network_group[FIELDS.CONTEXTS]) - 1:
                    raw_network_group[FIELDS.CONTEXTS] = raw_network_group[FIELDS.CONTEXTS][
                        preliminary_idx : last_dynamic_idx + 1
                    ]
                else:
                    raw_network_group[FIELDS.CONTEXTS] = raw_network_group[FIELDS.CONTEXTS][preliminary_idx:]

            proto_network_group = self._hef_proto.network_groups[0]
            network_group, performance_details = self._copy_checkpoints(raw_network_group, proto_network_group)
            network_group_copy = copy.deepcopy(network_group)
            self._move_first_context_configs(network_group, network_group_copy)
            # Update the time for multi context
            if len(network_group[FIELDS.CONTEXTS]) > 1:
                self._normalize_times(network_group)
                # Set actual start time as the time of last context infer finishes
                for context_index in range(len(network_group[FIELDS.CONTEXTS])):
                    current_context = network_group[FIELDS.CONTEXTS][context_index]
                    actual_start_time = self._get_actual_start_time(
                        FIELDS,
                        context_index,
                        network_group,
                        current_context,
                    )
                    current_context[FIELDS.ACTUAL_START_TIME] = actual_start_time

            self._processed_runtime_data[FIELDS.RUNS].append(network_group)
            self._performance_from_runtime_data.append(performance_details)

        self._processed_runtime_data[FIELDS.CLOCK] = self._get_nnm_clk_freq() / (10**6)

        # take first batch as multi context fps
        self._fps_multi_context = self._performance_from_runtime_data[0][self.PERFORMANCE_DETAILS_FIELDS.FPS]

    @staticmethod
    def _get_actual_start_time(FIELDS, context_index, network_group, current_context):
        if context_index == 0:
            return next(iter(current_context[FIELDS.LAYERS].values()))[FIELDS.START]

        prev_context = context_index - 1
        while prev_context > 0 and not network_group[FIELDS.CONTEXTS][prev_context][FIELDS.LAYERS]:
            prev_context -= 1

        return list(network_group[FIELDS.CONTEXTS][prev_context][FIELDS.LAYERS].values())[-1][FIELDS.END]

    def _get_layer_name(self, layer):
        return layer.name if self._is_multi_scope else layer.name_without_scope
