#!/usr/bin/env python

import copy
import math

import numpy as np
import tensorflow as tf
from past.utils import old_div

from hailo_model_optimization.acceleras.model.preprocess.conversion import (
    nv12_to_yuv_conversion,
    rgbx_to_rgb_conversion,
    yuy2_to_yuv_conversion,
)
from hailo_model_optimization.acceleras.utils.acceleras_definitions import (
    BiasMode,
    ConcatAxis,
    EWMultType,
    OutputMinMaxStrategy,
    PostprocessTarget,
    PrecisionMode,
)
from hailo_model_optimization.acceleras.utils.layer_utils import reshape_input_by_windows, reshape_output_by_windows
from hailo_sdk_client.emulator import model
from hailo_sdk_client.emulator.BBOX_decoder import decode_branch
from hailo_sdk_client.numeric_translator.inter_layer_precision_mode import InterLayerPrecisionMode
from hailo_sdk_client.sdk_backend.sdk_backend_exceptions import BackendEmulatorException, BackendNotImplementedError
from hailo_sdk_client.tools.layers.layers_utils import calculate_padding, calculate_padding_per_dim
from hailo_sdk_common.export.hailo_graph_export import (
    ExportLevel,
    HailoGraphExport,
    OutputTensorsExport,
    VariableExportLevel,
    VariablesExport,
)
from hailo_sdk_common.hailo_nn.exceptions import HailoNNException
from hailo_sdk_common.hailo_nn.hn_definitions import (
    ActivationType,
    DepthToSpaceType,
    FeatureMultiplierType,
    FormatConversionType,
    LayerType,
    PaddingType,
    PrecisionSplitMode,
    ResizeBilinearPixelsMode,
    ResizeMethod,
    SpaceToDepthType,
)
from hailo_sdk_common.hailo_nn.hn_layers.layer import Layer
from hailo_sdk_common.logger.logger import default_logger
from hailo_sdk_common.numeric_utils.numeric_utils import get_deconv_stack_order, is_super_deconv
from hailo_sdk_common.paths_manager.SimWrapper import HSimWrapper
from hailo_sdk_common.targets.inference_targets import EmulationInferenceTargets, FineTuneParams, SdkMixedParams

ACT_MULT_BANKERS_ROUNDING = 6
APU_POST_SHIFT_BITS = 13

NMS_FIRST_OP = "nms_first_op"
NMS_LAST_OP = "nms_last_op"


def wrap_around(x, bits=16, name="op_wraparound"):
    """
    Emulate the overflow of a signed accumulator by wrapping x into
    [-2**(bits-1), 2**(bits-1)).

    Args:
        x - data to wrap around
        bits - number of bits of the signed int representation (default=16)
        name - name given to the resulting identity node, for debug purposes
    output - the wrapped around values of x
    """
    half_range = 2 ** (bits - 1)
    full_range = 2 * half_range

    shifted = x + half_range
    # Floor-based modulo: shifted - full_range * floor(shifted / full_range),
    # which lands in [0, full_range) for negative inputs as well.
    quotient = tf.math.floor(tf.math.divide(shifted, full_range))
    wrapped = shifted - full_range * quotient
    return tf.identity(wrapped - half_range, name=name)


class MissingConstError(BackendEmulatorException):
    """Emulator exception subclass; by its name, presumably meant for a required const that was not supplied."""


class TFNode:
    """Plain record of the objects kept for one emulated layer; every constructor argument is stored as-is."""

    def __init__(
        self,
        name,
        inp,
        op,
        b_op,
        add_op,
        scaled,
        act,
        out,
        k=None,
        b=None,
        out8_pre_round=None,
        pre_act=None,
        post_act=None,
        stats=None,
        activation_histogram=None,
    ):
        # Mandatory entries.
        self.name, self.inp = name, inp
        self.op, self.b_op, self.add_op = op, b_op, add_op
        self.scaled, self.act, self.out = scaled, act, out
        # Optional entries, None when not supplied.
        self.k, self.b = k, b
        self.out8_pre_round = out8_pre_round
        self.pre_act, self.post_act = pre_act, post_act
        self.stats, self.activation_histogram = stats, activation_histogram


class TFStats:
    """Holder of per-layer statistics entries; each listed attribute is read from kwargs and defaults to None."""

    # Attributes that are always set on an instance (first group).
    MANDATORY_ATTR_LIST = [
        "input_min",
        "input_max",
        "output_min",
        "output_max",
        "pre_act_min",
        "pre_act_max",
        "output_min_features",
        "output_max_features",
        "stats_min_pre_act_features",
        "stats_max_pre_act_features",
        "stats_energy_out_features",
        "stats_energy_in_features",
        "stats_energy_pre_features",
        "stats_non_zero_percent_features",
    ]

    # Attributes that are also always set, but are expected to stay None for many layers.
    OPTIONAL_ATTR_LIST = [
        "elementwise_min",
        "elementwise_max",
        "stats_min_elementwise_features",
        "stats_max_elementwise_features",
        "out_clipped_values_min",
        "out_clipped_values_max",
        "stats_min_weights_input",
        "stats_max_weights_input",
    ]

    def __init__(self, **kwargs):
        # Keyword arguments that are not listed in either group are ignored.
        cls = type(self)
        for attr_group in (cls.MANDATORY_ATTR_LIST, cls.OPTIONAL_ATTR_LIST):
            for attr in attr_group:
                setattr(self, attr, kwargs.get(attr))


class TFModel(model.Model):
    def __init__(self, hailo_nn, target, optional_params):
        """
        Store the emulation configuration, attach a TF graph and session, and build the graph nodes.

        Args:
            hailo_nn: network description; its name, stable_toposort(), get_layer_by_name() and
                net_params.net_scopes are used here or by the graph build.
            target: emulation target (compared against EmulationInferenceTargets members).
            optional_params: object whose attributes (model_name, consts, custom_graph, ...) are read below.

        Raises:
            AssertionError: if the session in use does not belong to the selected graph.
            BackendEmulatorException: if a requested native layer does not exist in the hn.
        """
        super().__init__()
        self._logger = default_logger()

        self.target = target
        self.use_simulator = True
        # 16 for every target except SDK_DEBUG_PRECISE_NUMERIC, which gets 32.
        self.accumulator_size = 16 if self.target != EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC else 32

        self.hailo_nn = hailo_nn
        self.model_name = self.hailo_nn.name
        self.executable_model_suffix = optional_params.model_name if optional_params.model_name else ""
        # Build order of the layers; _build_graph may move the first input layer to the front.
        self.recipe = list(hailo_nn.stable_toposort(key="name"))
        self.nodes = {}
        self.conv_layers_inference = {}
        self.translate_input = optional_params.translate_input
        self.consts = optional_params.consts if optional_params.consts else {}
        self.translated_consts = optional_params.translated_consts if optional_params.translated_consts else {}
        self.activation_callback = optional_params.activation_callback
        self.fine_tune_params = (
            optional_params.fine_tune_params if optional_params.fine_tune_params else FineTuneParams()
        )
        self.mixed_params = optional_params.mixed_params if optional_params.mixed_params else SdkMixedParams()
        self.is_mercury_arch = optional_params.is_mercury_arch
        self.is_pluto_arch = optional_params.is_pluto_arch
        self.force_weightless_model = optional_params.force_weightless_model
        self.run_numeric_in_int32 = optional_params.run_numeric_in_int32
        self.activation_points = optional_params.activation_points
        self.twin_mode = optional_params.twin_mode
        # Per-layer registries, exposed through the prepare_*_export methods.
        self._bias_variables = {}
        self._bias_delta_variables = {}
        self._kernel_variables = {}
        self._kernel_delta_variables = {}
        self._variables_to_initialize = {}
        self._activation_histograms = {}
        self._input_nodes = {}
        self._enable_clipping = optional_params.enable_clipping
        self._reused_variables = {}
        self.local_consts = {}

        # Fine-tune tensors, exposed through the prepare_ft_*_export methods.
        self._ft_kernel_range_tensors = {}
        self._ft_alpha_tensors = {}
        self._ft_final_kernel_tensors = {}
        self._ft_kernel_frac_part_tensors = {}

        hsim_wrapper = HSimWrapper()
        hsim_wrapper.load()
        self.hsim = hsim_wrapper.hsim

        self.emulation_mult_shift = None
        self.emulation_shift = None

        self.current_scope = None
        # Must be set through set_load_params_func before load_params is called.
        self._load_params_func = None

        self._custom_inputs = optional_params.custom_inputs

        # Graph selection priority: graph of the custom inputs, then the custom graph, then a fresh graph.
        if self._custom_inputs:
            self.g = next(iter(self._custom_inputs.values())).graph
        elif optional_params.custom_graph:
            self.g = optional_params.custom_graph
        else:
            self.g = tf.Graph()

        # self.g = optional_params.custom_graph if optional_params.custom_graph else tf.compat.v1.get_default_graph()

        # Use the caller's session when given; otherwise open one bound to the selected graph.
        self.s = (
            optional_params.custom_session if optional_params.custom_session else tf.compat.v1.Session(graph=self.g)
        )
        assert self.s.graph is self.g, f"Session {self.s} does not belong to graph {self.g}"

        # mixed mode stuff
        # mixed_native_layers holds layer names; mixed_numeric_layers holds layer objects.
        self.mixed_native_layers = []
        self.mixed_numeric_layers = []
        is_mixed = False

        if optional_params.native_layers is not None and self.target in [
            EmulationInferenceTargets.SDK_MIXED,
            EmulationInferenceTargets.SDK_FINE_TUNE,
        ]:
            self._validate_native_layers_in_graph(optional_params.native_layers)
            self.mixed_native_layers = optional_params.native_layers

        if self.target == EmulationInferenceTargets.SDK_MIXED:
            # The target is switched away from SDK_MIXED for the build and restored afterwards.
            self.target = EmulationInferenceTargets.SDK_NATIVE
            for layer in self.hailo_nn.stable_toposort():
                if layer.name not in self.mixed_native_layers:
                    self.mixed_numeric_layers.append(layer)
            is_mixed = True

        # This function must be called from the __init__ function,
        # to append the nodes to the graph and set the variable space
        self._build_graph(is_mixed)

        if is_mixed:
            self.target = EmulationInferenceTargets.SDK_MIXED

    def _validate_native_layers_in_graph(self, native_layers):
        """
        Verify that every requested native layer exists in the hn.

        Args:
            native_layers: iterable of layer names, each looked up with self.hailo_nn.get_layer_by_name.

        Raises:
            BackendEmulatorException: for the first name whose lookup raises HailoNNException.
                The lookup error is chained as the explicit cause so the traceback keeps it.
        """
        for layer_name in native_layers:
            try:
                self.hailo_nn.get_layer_by_name(layer_name)
            except HailoNNException as err:
                raise BackendEmulatorException(
                    f"native_layers for SDK_MIXED contains a layer ({layer_name}) that doesn't exists in the hn",
                ) from err

    def _get_accumulator_size(self, inter_layer_precision_mode, is_16bit_precision_mode=False):
        """
        Return the accumulator size of the given inter-layer precision mode.

        The size is doubled when the target is SDK_DEBUG_PRECISE_NUMERIC and the layer is not in
        16-bit precision mode.
        """
        base_size = inter_layer_precision_mode.accumulator_size
        is_precise_target = self.target == EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC
        if is_precise_target and not is_16bit_precision_mode:
            return base_size * 2
        return base_size

    def _add_variables_to_initialize(self, layer_name, variables):
        if layer_name not in self._variables_to_initialize:
            self._variables_to_initialize[layer_name] = []
        self._variables_to_initialize[layer_name].extend(variables)

    def _get_param_key(self, param):
        if not self.twin_mode:
            return param

        param_parts = param.split("/", 1)
        scope = f"{param_parts[0]}_{self.executable_model_suffix}"
        return f"{scope}/{param_parts[-1]}"

    def export_description(self):
        """Return the graph definition produced by self.g.as_graph_def()."""
        graph_def = self.g.as_graph_def()
        return graph_def

    def set_load_params_func(self, load_params_func):
        """Register the zero-argument callable that load_params invokes to obtain the params."""
        setattr(self, "_load_params_func", load_params_func)

    def load_params(self):
        """
        Fetch params through the registered load-params function and assign them to graph variables.

        Returns without doing anything when the function yields None. Entries named "params_kind"
        and entries for which no global variable is found are skipped.

        Raises:
            ValueError: if more than one global variable is returned for an entry name.
        """
        piecewise_suffixes = (
            "output_stage/piecewise/x_points:0",
            "output_stage/piecewise/slopes:0",
            "output_stage/piecewise/offsets:0",
            "output_stage/piecewise/slopes_m:0",
            "output_stage/piecewise/slopes_e:0",
        )
        sess = self.s
        graph = self.g
        loader = self._load_params_func

        loaded = loader()
        if loaded is None:
            return

        with sess.as_default(), graph.as_default():
            assignments = []
            for var_name, var_value in loaded.params.items():
                if var_name == "params_kind":
                    continue
                if var_name.endswith(piecewise_suffixes) and len(var_value.shape) == 1:
                    # For backwards compatibility: 1-D piecewise params get a leading axis.
                    var_value = np.expand_dims(var_value, 0)
                candidates = tf.compat.v1.global_variables(var_name)
                if len(candidates) > 1:
                    raise ValueError("Multiple tensors with the same name")
                if len(candidates) == 0:
                    continue
                assignments.append(candidates[0].assign(var_value))
            # All assignments are executed together in a single run call.
            sess.run(assignments)

    def export_params(self):
        """Params export is not implemented for this model; always raises BackendNotImplementedError."""
        message = "Unable yet to export params"
        raise BackendNotImplementedError(message)

    def get_input_layer(self):
        """
        Return the input tensor (`inp`) of the node built for the first layer of the recipe.

        Raises:
            BackendEmulatorException: if more than one input node is registered.
        """
        if len(self._input_nodes) > 1:
            raise BackendEmulatorException("Unable to get input layer of model with multiple inputs")
        first_layer = self.recipe[0]
        first_node = self.nodes[first_layer]
        return first_node.inp

    def get_input_layers(self):
        """Return the registry of input nodes (the same object that is stored on the model, not a copy)."""
        input_nodes = self._input_nodes
        return input_nodes

    def get_tensors_layers_names(self, layers, real_outputs=False):
        """
        Return a layer name for every output tensor selected from the given layers.

        Only NN-core layers contribute names. A layer with a single output tensor and several
        successors contributes its name once. Otherwise:
          * real_outputs=False: the name is repeated once per output tensor of the layer;
          * real_outputs=True: the name is added once per (output tensor, successor) pair whose
            successor is an output-like layer or has at least one CPU-engine successor.

        Args:
            layers: iterable of hn layers that have a built node in self.nodes.
            real_outputs: select only tensors that feed real outputs (see above).

        Returns:
            list of layer names, in the order of `layers`.
        """
        names = []
        for layer in layers:
            if layer.engine != PostprocessTarget.NN_CORE:
                # Non-core layers never contributed names; skipping them up front also avoids
                # reading `successors` that was never computed for this layer (or was left over
                # from the previous one) when real_outputs is set.
                continue

            node_outputs = self.nodes[layer].out
            successors = list(self.hailo_nn.successors(layer))
            if len(node_outputs) == 1 and len(successors) > 1:
                # All successors get the same input
                names.append(layer.name)
                continue

            if not real_outputs:
                names.extend(layer.name for _ in node_outputs)
                continue

            output_like_ops = [
                LayerType.output_layer,
                LayerType.output_mux,
                LayerType.pp_output_layer,
                LayerType.external_output_layer,
            ]
            for _, succ in zip(node_outputs, successors):
                # The successor is the last core layer when one of its own successors runs on the CPU.
                # NOTE(review): get_tensors tests `succ.engine == CPU` instead - confirm which rule is intended.
                last_core_layer = any(
                    next_succ.engine == PostprocessTarget.CPU for next_succ in self.hailo_nn.successors(succ)
                )
                if succ.op in output_like_ops or last_core_layer:
                    names.append(layer.name)

        return names

    def get_tensors(self, layers, only_output_tensors=False):
        """
        Collect output tensors of the given NN-core layers.

        A layer with a single output tensor and several successors contributes that tensor once.
        Otherwise output tensors are paired with successors; a tensor is kept when its successor
        is an output-like layer, when only_output_tensors is False, or when the successor runs
        on the CPU engine.
        """
        output_like_ops = [
            LayerType.output_layer,
            LayerType.output_mux,
            LayerType.pp_output_layer,
            LayerType.external_output_layer,
        ]
        collected = []
        for layer in layers:
            if layer.engine != PostprocessTarget.NN_CORE:
                continue

            consumers = list(self.hailo_nn.successors(layer))
            produced = self.nodes[layer].out

            if len(produced) == 1 and len(consumers) > 1:
                # All successors get the same input
                collected.append(produced[0])
                continue

            for tensor, consumer in zip(produced, consumers):
                is_wanted = (
                    consumer.op in output_like_ops
                    or not only_output_tensors
                    or consumer.engine == PostprocessTarget.CPU
                )
                if is_wanted:
                    collected.append(tensor)

        return collected

    def get_real_output_layers(self):
        """
        Return the hn real output layers, extended with built layers that have no NN-core successor.

        Output-like layers (output_layer, output_mux, pp_output_layer, external_output_layer) are
        never added by this extension.
        """
        output_like_ops = [
            LayerType.output_layer,
            LayerType.output_mux,
            LayerType.pp_output_layer,
            LayerType.external_output_layer,
        ]
        # NOTE(review): the list returned by hailo_nn is appended to in place - confirm it is a fresh list.
        real_outputs = self.hailo_nn.get_real_output_layers()
        for layer in self.nodes:
            # Add post-process layers
            core_successors = [
                succ for succ in self.hailo_nn.successors(layer) if succ.engine == PostprocessTarget.NN_CORE
            ]
            if len(core_successors) != 0:
                continue
            if layer.op not in output_like_ops:
                real_outputs.append(layer)
        return real_outputs

    def get_output_layers(self):
        """Return the tensors of the real output layers, restricted to output tensors."""
        real_outputs = self.get_real_output_layers()
        return self.get_tensors(real_outputs, only_output_tensors=True)

    def get_output_layers_original_names(self):
        """Return the original_names attribute of every real output layer, in order."""
        original_names = []
        for layer in self.get_real_output_layers():
            original_names.append(layer.original_names)
        return original_names

    def get_output_layers_names(self):
        """Return the tensor layer names computed for the real output layers."""
        real_outputs = self.get_real_output_layers()
        return self.get_tensors_layers_names(real_outputs)

    def get_sorted_inner_layers(self, include_outputs=False):
        """
        Return the NN-core layers of the recipe, in recipe order.

        output_mux layers are always left out; output_layer layers are kept only when
        include_outputs is set.
        """
        inner_layers = []
        for layer in self.recipe:
            output_allowed = layer.op != LayerType.output_layer or include_outputs
            if (
                output_allowed
                and layer.op != LayerType.output_mux
                and layer.engine == PostprocessTarget.NN_CORE
            ):
                inner_layers.append(layer)
        return inner_layers

    def get_inner_layers_output_tensors(self):
        """Return the tensors collected by get_tensors for the sorted inner layers."""
        inner_layers = self.get_sorted_inner_layers()
        return self.get_tensors(inner_layers)

    def get_inner_layers_names(self):
        """Return the tensor layer names computed for the sorted inner layers."""
        inner_layers = self.get_sorted_inner_layers()
        return self.get_tensors_layers_names(inner_layers)

    def prepare_stats_export(self):
        """
        Build the CALIBRATION_STATS export from the stats of the sorted inner layers.

        For every NN-core layer whose node has stats, all mandatory stats are emitted (in
        TFStats.MANDATORY_ATTR_LIST order), followed by the optional stats that are not None.
        The node name is repeated once per emitted tensor.
        """
        tensors = []
        names = []
        for layer in self.get_sorted_inner_layers():
            if layer.engine != PostprocessTarget.NN_CORE:
                continue
            node = self.nodes[layer]
            stats = node.stats
            if stats is None:
                continue

            layer_tensors = [getattr(stats, attr) for attr in TFStats.MANDATORY_ATTR_LIST]
            optional_values = (getattr(stats, attr) for attr in TFStats.OPTIONAL_ATTR_LIST)
            layer_tensors.extend(value for value in optional_values if value is not None)

            tensors.extend(layer_tensors)
            names.extend([node.name] * len(layer_tensors))

        return OutputTensorsExport(
            export_level=ExportLevel.CALIBRATION_STATS,
            tensors=tensors,
            layers_names=names,
        )

    def prepare_ft_output_export(self):
        """Build the FT_TRAIN_OUTPUTS export from the fine-tune edge layers, their names and all their tensors."""
        edge_layers = self.get_ft_train_output_layers()
        edge_names = self.get_tensors_layers_names(edge_layers, real_outputs=False)
        # Second argument is only_output_tensors: every paired tensor is kept.
        edge_tensors = self.get_tensors(edge_layers, False)

        export = OutputTensorsExport(
            export_level=ExportLevel.FT_TRAIN_OUTPUTS,
            tensors=edge_tensors,
            layers_names=edge_names,
        )
        return export

    def get_ft_train_output_layers(self):
        """
        Find the edge layers for fine-tune.

        Walks the graph breadth-first from the input layers. A layer that has at least one
        successor with finetune_supported == False is an edge: it is returned and the walk does
        not continue past it. Layers without successors are not edges.
        Unsupported layers are layers we don't want to backpropagate through
        (non-differentiable layers or others).
        """
        pending = list(self.hailo_nn.get_input_layers())
        visited = set()
        edge_layers = []
        cursor = 0
        # `pending` is used as a FIFO queue; `cursor` points at the next layer to handle.
        while cursor < len(pending):
            layer = pending[cursor]
            cursor += 1
            if layer in visited:
                continue
            visited.add(layer)

            successors = list(self.hailo_nn.successors(layer))
            if all(succ.finetune_supported for succ in successors):
                pending.extend(successors)
            else:
                edge_layers.append(layer)
        return edge_layers

    def prepare_bias_export(self):
        """Package the registered bias variables as a BIASES export; the dict keys become layers_names."""
        registry = self._bias_variables
        return VariablesExport(
            export_level=VariableExportLevel.BIASES,
            variables=list(registry.values()),
            layers_names=list(registry),
        )

    def prepare_bias_delta_export(self):
        """Package the registered bias-delta variables as a BIASES_DELTA export; the dict keys become layers_names."""
        registry = self._bias_delta_variables
        return VariablesExport(
            export_level=VariableExportLevel.BIASES_DELTA,
            variables=list(registry.values()),
            layers_names=list(registry),
        )

    def prepare_kernel_variables_export(self):
        """Package the registered kernel variables as a KERNELS export; the dict keys become layers_names."""
        registry = self._kernel_variables
        return VariablesExport(
            export_level=VariableExportLevel.KERNELS,
            variables=list(registry.values()),
            layers_names=list(registry),
        )

    def prepare_kernel_delta_variables_export(self):
        """Package the registered kernel-delta variables as a KERNELS_DELTA export; the dict keys become layers_names."""
        registry = self._kernel_delta_variables
        return VariablesExport(
            export_level=VariableExportLevel.KERNELS_DELTA,
            variables=list(registry.values()),
            layers_names=list(registry),
        )

    def prepare_ft_kernel_range_tensors_export(self):
        """Package the registered fine-tune kernel-range tensors as an FT_KERNEL_RANGE export."""
        registry = self._ft_kernel_range_tensors
        return OutputTensorsExport(
            export_level=ExportLevel.FT_KERNEL_RANGE,
            tensors=list(registry.values()),
            layers_names=list(registry),
        )

    def prepare_ft_alpha_tensors_export(self):
        """Package the registered fine-tune alpha tensors as an FT_ALPHA export."""
        registry = self._ft_alpha_tensors
        return OutputTensorsExport(
            export_level=ExportLevel.FT_ALPHA,
            tensors=list(registry.values()),
            layers_names=list(registry),
        )

    def prepare_ft_final_kernel_tensors_export(self):
        """Package the registered fine-tune final-kernel tensors as an FT_FINAL_KERNEL export."""
        registry = self._ft_final_kernel_tensors
        return OutputTensorsExport(
            export_level=ExportLevel.FT_FINAL_KERNEL,
            tensors=list(registry.values()),
            layers_names=list(registry),
        )

    def prepare_ft_kernel_frac_part_tensors_export(self):
        """Package the registered fine-tune kernel fractional-part tensors as an FT_KERNEL_FRAC_PART export."""
        registry = self._ft_kernel_frac_part_tensors
        return OutputTensorsExport(
            export_level=ExportLevel.FT_KERNEL_FRAC_PART,
            tensors=list(registry.values()),
            layers_names=list(registry),
        )

    def prepare_variables_to_initialize_export(self):
        """
        Flatten the per-layer lists of variables to initialize into an UNSET_VARIABLES export.

        The layer name is repeated once for each of its variables, so both lists stay aligned.
        """
        flat_variables = []
        flat_names = []
        for layer_name, layer_variables in self._variables_to_initialize.items():
            count_before = len(flat_variables)
            flat_variables.extend(layer_variables)
            flat_names.extend([layer_name] * (len(flat_variables) - count_before))
        return VariablesExport(
            export_level=VariableExportLevel.UNSET_VARIABLES,
            variables=flat_variables,
            layers_names=flat_names,
        )

    def prepare_activations_histograms_export(self):
        """Package the registered activation histogram tensors as an ACTIVATIONS_HISTOGRAMS export."""
        registry = self._activation_histograms
        return OutputTensorsExport(
            export_level=ExportLevel.ACTIVATIONS_HISTOGRAMS,
            tensors=list(registry.values()),
            layers_names=list(registry),
        )

    def node_has_multi_out_tensors(self, node):
        """Tell whether the node holds more than one output tensor (used for feature_splitter)."""
        output_count = len(node.out)
        return output_count > 1

    def prepare_full_graph_export(self):
        """
        Build the ALL_LAYERS_ALL_OPS and ALL_LAYERS_PRE_ACT_OPS exports.

        For every sorted inner NN-core layer (outputs included) the node tensors are gathered in
        this order: inp, all of out, pre_act, act, scaled, b_op, op, add_op, k, out8_pre_round,
        post_act. pre_act tensors are additionally gathered for the pre-act export.

        Returns:
            (full_export, pre_act_export)
        """
        tensors_by_layer = {}
        pre_act_by_layer = {}
        remaining_attrs = ("act", "scaled", "b_op", "op", "add_op", "k", "out8_pre_round", "post_act")

        def is_exportable(tensor):
            # NOTE(review): the membership test compares a tensor name with the layer-name keys of
            # tensors_by_layer - confirm this is the intended de-duplication.
            return (tensor is not None) and (tensor.name not in tensors_by_layer)

        for layer in self.get_sorted_inner_layers(include_outputs=True):
            if layer.engine != PostprocessTarget.NN_CORE:
                continue
            node = self.nodes[layer]
            layer_tensors = tensors_by_layer[node.name] = []

            if is_exportable(node.inp):
                layer_tensors.append(node.inp)
            if (node.out is not None) and (node.out[0].name not in tensors_by_layer):
                # It is ok to check only the first output because the name is the same
                layer_tensors.extend(node.out)
            if is_exportable(node.pre_act):
                pre_act_by_layer.setdefault(node.name, []).append(node.pre_act)
                layer_tensors.append(node.pre_act)

            for attr in remaining_attrs:
                tensor = getattr(node, attr)
                if is_exportable(tensor):
                    layer_tensors.append(tensor)

        full_tensors, full_names = [], []
        for layer_name, layer_tensors in tensors_by_layer.items():
            full_tensors += layer_tensors
            full_names += [layer_name] * len(layer_tensors)
        full_export = OutputTensorsExport(
            export_level=ExportLevel.ALL_LAYERS_ALL_OPS,
            tensors=full_tensors,
            layers_names=full_names,
        )

        pre_act_tensors, pre_act_names = [], []
        for layer_name, layer_tensors in pre_act_by_layer.items():
            pre_act_tensors += layer_tensors
            pre_act_names += [layer_name] * len(layer_tensors)
        pre_act_export = OutputTensorsExport(
            export_level=ExportLevel.ALL_LAYERS_PRE_ACT_OPS,
            tensors=pre_act_tensors,
            layers_names=pre_act_names,
        )

        return full_export, pre_act_export

    def get_finalized_conv_layers_inference(self, res_by_layer):
        """
        Fill the "*_value" entries of self.conv_layers_inference from the inference results.

        Every "<stat>_name" entry of a layer holds a key of res_by_layer; the matching result is
        stored under "<stat>_value". Layers marked with "dummy_conv" are left untouched.
        Elementwise and weights-input stats are filled only for layers that carry
        "elementwise_name" / "weights_input_name". The forced-limvals flags default to False.

        Args:
            res_by_layer: mapping from tensor name to its inference result.

        Returns:
            (res_by_layer, self.conv_layers_inference)
        """
        elementwise_pairs = [
            ("stats_min_elementwise_name", "stats_min_elementwise_value"),
            ("stats_max_elementwise_name", "stats_max_elementwise_value"),
            ("stats_min_elementwise_features_name", "stats_min_elementwise_features_value"),
            ("stats_max_elementwise_features_name", "stats_max_elementwise_features_value"),
        ]
        weights_input_pairs = [
            ("stats_min_weights_input_name", "stats_min_weights_input_value"),
            ("stats_max_weights_input_name", "stats_max_weights_input_value"),
        ]
        common_pairs = [
            ("stats_min_input_name", "stats_min_input_value"),
            ("stats_max_input_name", "stats_max_input_value"),
            ("stats_min_output_name", "stats_min_output_value"),
            ("stats_max_output_name", "stats_max_output_value"),
            ("stats_min_pre_act_name", "stats_min_pre_act_value"),
            ("stats_max_pre_act_name", "stats_max_pre_act_value"),
            ("stats_min_output_features_name", "stats_min_output_features_value"),
            ("stats_max_output_features_name", "stats_max_output_features_value"),
            ("stats_min_pre_act_features_name", "stats_min_pre_act_features_value"),
            ("stats_max_pre_act_features_name", "stats_max_pre_act_features_value"),
            ("stats_output_energy_features_name", "stats_output_energy_features_value"),
            ("stats_input_energy_features_name", "stats_input_energy_features_value"),
            ("stats_pre_energy_features_name", "stats_pre_energy_features_value"),
            ("stats_non_zero_percent_features_name", "stats_non_zero_percent_features_value"),
        ]

        for layer_name, layer_info in self.conv_layers_inference.items():
            # Ignoring dummy-conv (in/out layers) because we can't collect stats on them.
            if "dummy_conv" in layer_info:
                continue

            pairs_to_fill = []
            if "elementwise_name" in layer_info:
                pairs_to_fill.extend(elementwise_pairs)
            if "weights_input_name" in layer_info:
                pairs_to_fill.extend(weights_input_pairs)
            pairs_to_fill.extend(common_pairs)

            for name_key, value_key in pairs_to_fill:
                layer_info[value_key] = res_by_layer[layer_info[name_key]]

            layer_info["limvals_output_forced"] = res_by_layer.get(f"{layer_name}/limvals_out_forced:0", False)
            layer_info["limvals_input_forced"] = res_by_layer.get(f"{layer_name}/limvals_inp_forced:0", False)

        return res_by_layer, self.conv_layers_inference

    def _build_graph(self, is_mixed=False):
        """
        Build the graph nodes of every net scope inside self.g.

        For each scope of hailo_nn.net_params.net_scopes a TF variable scope is opened (its name
        gets "_<executable_model_suffix>" appended when the suffix is non-empty) and stored in
        self.model_variable_scopes. Every NN-core layer of the recipe that belongs to the scope
        is then built with _build_graph_node.

        Args:
            is_mixed: when True, self.target is set to SDK_FP_OPTIMIZED for the build.

        Raises:
            StopIteration: if the recipe holds no input_layer (raised by the bare next() below).
        """
        with self.g.as_default():
            # One slot per net scope, filled by the `as` clause of the variable_scope below.
            self.model_variable_scopes = [None] * len(self.hailo_nn.net_params.net_scopes)
            for scope_index, scope in enumerate(self.hailo_nn.net_params.net_scopes):
                if is_mixed:
                    self.target = EmulationInferenceTargets.SDK_FP_OPTIMIZED
                variable_scope = scope + "_" + self.executable_model_suffix if self.executable_model_suffix else scope
                with tf.compat.v1.variable_scope(variable_scope) as self.model_variable_scopes[scope_index]:
                    with tf.compat.v1.variable_scope("General"):
                        # Re-created for every scope; self.is_training ends up being the last one.
                        self.is_training = tf.keras.Input(dtype=tf.bool, name="is_training", shape=())
                    # Alias of self.recipe: the reordering below mutates self.recipe in place.
                    build_graph_order = self.recipe
                    # make sure that the first layer is a real input layer
                    first_input_layer = next(
                        (i, v) for i, v in enumerate(build_graph_order) if v.op == LayerType.input_layer
                    )
                    if first_input_layer[0] != 0:
                        build_graph_order.remove(first_input_layer[1])
                        build_graph_order.insert(0, first_input_layer[1])
                    for node in self.recipe:
                        # Only layers of the current scope that run on the NN core get a node.
                        if self._is_node_in_scope(node, scope) and node.engine == PostprocessTarget.NN_CORE:
                            self._build_graph_node(node)

    def _variable_scope_without_executable_model_suffix(self, variable_scope):
        if self.executable_model_suffix:
            return variable_scope.replace(f"_{self.executable_model_suffix}", "")
        else:
            return variable_scope

    def _get_scope_from_the_name_of_tensor_or_layer(self, layer):
        """
        return the scope of a layer or tensor

        Args:
            layer: the layer or tensor as written in the tensor/layer_hn

        Returns: the scope the layer is in

        """
        scope = self._variable_scope_without_executable_model_suffix(layer.name.split("/")[0])
        index = self.hailo_nn.net_params.net_scopes.index(scope)
        return self.model_variable_scopes[index]

    @staticmethod
    def _is_node_in_scope(node, scope):
        return scope == node.scope

    def _handle_output_mux(self, hn_node, l_desc):
        """
        Return hn_node, unless it is an output_mux.

        For an output_mux the first non-mux layer found upstream of l_desc is returned instead;
        this is needed so that the emulator will not fail.
        """
        is_output_mux = hn_node.op == LayerType.output_mux
        return self._find_first_non_mux_layer(l_desc) if is_output_mux else hn_node

    def _find_first_non_mux_layer(self, l_desc):
        """
        Return the first predecessor of l_desc that is not an output_mux.

        When every predecessor is an output_mux, the search moves on to the predecessors of the
        first one, and so on. An IndexError is raised if a layer without predecessors is reached.
        """
        current = l_desc
        while True:
            predecessors = list(self.hailo_nn.predecessors(current))
            for candidate in predecessors:
                if candidate.op != LayerType.output_mux:
                    return candidate
            # Only muxes here (or nothing at all): keep looking above the first predecessor.
            current = predecessors[0]

    def _set_emulation_mult_shift(self, l_desc, is_16bit_precision_mode):
        """
        Store self.emulation_mult_shift as an int8 cast derived from self.emulation_shift.

        The shift itself is cast when _is_int32_numeric(l_desc, is_16bit_precision_mode) holds;
        otherwise 2**shift is cast.
        NOTE(review): the int8 cast presumably requires 2**shift to fit in int8 - confirm the
        range of emulation_shift.
        """
        if self._is_int32_numeric(l_desc, is_16bit_precision_mode):
            raw_value = self.emulation_shift
        else:
            raw_value = 2**self.emulation_shift
        self.emulation_mult_shift = tf.cast(raw_value, dtype=tf.int8)

    def _is_16bit_input(self, l_desc):
        """
        Tell whether the input precision mode of the layer is a16_w16_a16.

        Returns False when the layer has no precision mode configured.
        """
        precision_mode = l_desc.precision_config.precision_mode
        if precision_mode is None:
            return False
        return precision_mode.input_precision_mode() == PrecisionMode.a16_w16_a16

    def _is_16bit_weights(self, l_desc):
        return (
            False
            if l_desc.precision_config.precision_mode is None
            else l_desc.precision_config.precision_mode.weight_bits() == 16
        )

    def _is_zp_required(self, is_16bit_precision_mode):
        return is_16bit_precision_mode and not self.is_pluto_arch

    def get_layer_input_tensors(self, l_desc, layer_inputs_to_iterate):
        """
        Collect, for every input layer, the already-built tensor that feeds ``l_desc``.

        Each input layer first goes through ``self._handle_output_mux``. If the resulting node has
        several output tensors, the one at the position of ``l_desc.name`` in the producer's
        ``outputs`` list is taken; otherwise its first output tensor is used.

        Args:
            l_desc: the consuming layer
            layer_inputs_to_iterate: iterable of the layers feeding ``l_desc``

        Returns: list with one tensor per input layer, in iteration order

        """
        tensors = []
        for candidate in layer_inputs_to_iterate:
            producer = self._handle_output_mux(candidate, l_desc)
            tf_node = self.nodes[producer]
            out_idx = producer.outputs.index(l_desc.name) if self.node_has_multi_out_tensors(tf_node) else 0
            tensors.append(tf_node.out[out_idx])
        return tensors

    def get_number_of_expected_ops(self, l_desc):
        """
        Return how many partial ops have to be built for the layer.

        Args:
            l_desc: the layer being built

        Returns: 1 unless the architecture is pluto and the layer has 16-bit input; in that case
            4 when the weights are 16-bit as well ([inp, weights]: [L,L], [H,L], [L,H], [H,H]),
            otherwise 2 ([inp, weights]: [L,L], [H,L])

        """
        if not (self.is_pluto_arch and self._is_16bit_input(l_desc)):
            return 1
        return 4 if self._is_16bit_weights(l_desc) else 2

    def split_inputs_for_16bit_precision(self, l_desc, inputs):
        """
        Split every input tensor into its low byte and its high byte, as float32 tensors.

        The split only happens on pluto architecture with a 16-bit input layer.

        Args:
            l_desc: the layer being built
            inputs: list of input tensors

        Returns: ``inputs`` unchanged (a single list) when no split is needed, otherwise a tuple
            ``(low_inputs, high_inputs)`` of two lists. Callers must handle both shapes.

        """
        if not (self.is_pluto_arch and self._is_16bit_input(l_desc)):
            return inputs

        low_inputs, high_inputs = [], []
        for tensor in inputs:
            # bits 0..7
            low_byte = tf.bitwise.bitwise_and(tensor, 0xFF)
            low_inputs.append(tf.cast(low_byte, dtype=tf.float32))
            # bits 8..15
            shifted = tf.bitwise.right_shift(tensor, 8)
            high_byte = tf.bitwise.bitwise_and(shifted, 0xFF)
            high_inputs.append(tf.cast(high_byte, dtype=tf.float32))
        return low_inputs, high_inputs

    def _build_3d_conv(self, l_desc, inputs, inter_layer_precision_mode):
        """
        Build a conv over an input holding several disparity slices, as a series of 2D convs.

        The last axis of ``inputs[0]`` is padded with the padding constant, then cut into windows of
        ``kernel_disparity * input_features`` channels taken every ``stride`` channels. Each window
        goes through ``self._build_conv`` with the same kernel variables, and the per-window outputs
        are concatenated along the last axis.

        Args:
            l_desc: the conv layer as written in the hn
            inputs: list of input tensors; only ``inputs[0]`` is used
            inter_layer_precision_mode: forwarded unchanged to ``self._build_conv``

        Returns: tuple of (concatenated output tensor, ``k`` returned by the last ``_build_conv`` call)

        """
        # number of features in a single disparity slice
        input_features = l_desc.input_features // l_desc.input_disparity
        # number of disparity slices covered by the kernel
        kernel_disparity = l_desc.kernel_shape[2] // input_features
        stride = l_desc.strides[-1]
        two_d_outputs = []
        # first element of the flattened "zero_point_in" variable
        # (random initial value - presumably overwritten when params are loaded, TODO confirm)
        zp_in = tf.reshape(
            tf.Variable(
                tf.random.normal([l_desc.output_features], stddev=0.35),
                name="zero_point_in",
                shape=tf.TensorShape(None),
            ),
            [-1],
        )[0]
        pad_val = self._get_const("padding_const_value", zp_in)
        # the kernel variables are created once and shared by all the 2D convs below
        k_vars = self._build_kernel(
            tf.Variable(tf.random.normal(l_desc.kernel_shape, stddev=0.35), name="kernel"),
            l_desc,
        )
        # padding amounts along the disparity dimension, counted in disparity slices
        num_padding_start, num_padding_end = calculate_padding_per_dim(
            l_desc.padding,
            kernel_disparity,
            stride // input_features,
            l_desc.input_disparity,
            1,
        )
        # end_point is the exclusive upper bound for the window start offsets
        if l_desc.padding == PaddingType.valid:
            end_point = l_desc.input_features - l_desc.kernel_shape[2] + input_features
        else:
            end_point = l_desc.input_features
        input_3d = inputs[0]
        # subtract pad_val, concat zeros and add pad_val back: the padded channels end up equal to pad_val
        input_3d = tf.add(input_3d, -pad_val)
        padding_tensor_start = tf.zeros(
            (tf.shape(inputs[0])[0], l_desc.input_shape[1], l_desc.input_shape[2], num_padding_start * input_features),
            dtype=tf.float32,
        )
        padding_tensor_end = tf.zeros(
            (tf.shape(inputs[0])[0], l_desc.input_shape[1], l_desc.input_shape[2], num_padding_end * input_features),
            dtype=tf.float32,
        )
        input_3d = tf.concat([tf.concat([padding_tensor_start, input_3d], axis=-1), padding_tensor_end], axis=-1)
        input_3d = tf.add(input_3d, pad_val)
        # one 2D conv per window along the last axis
        # NOTE(review): if end_point <= 0 the loop body never runs, so `op`, `k` and two_d_outputs[0] below fail
        for i in range(0, end_point, stride):
            input = input_3d[:, :, :, i : i + kernel_disparity * input_features]
            op, k = self._build_conv(
                l_desc, [input], inter_layer_precision_mode, elementwise=None, k_vars=k_vars, pad_val=pad_val
            )
            two_d_outputs.append(op)
        # concatenate the per-window outputs along the last axis
        final_output = two_d_outputs[0]
        for i in range(1, len(two_d_outputs)):
            final_output = tf.concat([final_output, two_d_outputs[i]], axis=-1)
        # NOTE(review): the node is registered with the last window's op only, not with final_output - confirm intended
        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, [op])
        return final_output, k

    def _build_graph_node(self, l_desc):
        """
        Build the TF sub-graph emulating a single hn layer and register it in ``self.nodes``.

        Layer types with a dedicated builder (input, output, concat, maxpool, nms, ...) are delegated
        to that builder and its return value is returned as is. All the other layer types go through
        a shared pipeline: main op -> bias -> optional elementwise add -> activation / output stage,
        after which a ``TFNode`` holding the intermediate tensors is stored in ``self.nodes[l_desc]``.

        Args:
            l_desc: the layer as written in the hn

        Returns: for delegated layer types, whatever the dedicated builder returns; None for
            output_mux; the decomposed conv op for convs with decomposed weights in the numeric
            targets; otherwise the list of output tensors of the node (``self.nodes[l_desc].out``)

        Raises:
            BackendEmulatorException: when ``l_desc.in_emulation_graph`` is False, for postprocess
                layers, or when fake-quant limits of the input have the same sign in fine-tune
            BackendNotImplementedError: when the layer type is not handled by the shared pipeline

        """
        tf.random.set_seed(1)
        # input layers have a dedicated builder and need no predecessors' tensors
        if l_desc.op in (LayerType.input_layer, LayerType.const_input):
            return self._build_input_layer(l_desc)
        layer_inputs = list(self.hailo_nn.predecessors(l_desc))
        inputs = self.get_layer_input_tensors(l_desc, layer_inputs)

        with tf.compat.v1.variable_scope(l_desc.name_without_scope) as self.current_scope:
            layer_name = l_desc.name_without_scope
            scope = self._get_scope_from_the_name_of_tensor_or_layer(l_desc)
            qp_in = self._get_variable_with_scope(variable_name="qp_in", layer_name=layer_name, scope=scope)

            if not l_desc.in_emulation_graph:
                raise BackendEmulatorException(
                    f"in_emulation_graph=False for {l_desc.name} which is not supported with legacy emulation, "
                    f"please use Acceleras for emulation",
                )
            if l_desc in self.mixed_numeric_layers and l_desc.op != LayerType.concat:
                inputs = self._handle_layers_with_sdk_mixed(inputs, l_desc, qp_in)
            inp = inputs[0]
            # layer types below are fully handled by a dedicated builder and return immediately,
            # without going through the shared op -> bias -> activation pipeline further down
            if l_desc.op == LayerType.output_layer:
                return self._build_output_layer(l_desc, inp)
            if l_desc.op == LayerType.output_mux:
                return None
            if l_desc.op == LayerType.concat:
                return self._build_concat_layer(l_desc, inputs)
            if l_desc.op == LayerType.maxpool:
                return self._build_max_pool(l_desc, inp)
            if l_desc.op == LayerType.resize:
                # NOTE: is_resize_bilinear_quantized is bound only for resize layers; every later use
                # is guarded by `l_desc.op == LayerType.resize and ...`, so short-circuit keeps it safe
                is_resize_bilinear_quantized = not l_desc.is_bilinear_align_corners_not_quantized
                non_quantized_bilinear = (
                    self.activation_points is None or self.current_scope.name not in self.activation_points
                ) and self.target in [
                    EmulationInferenceTargets.SDK_NUMERIC,
                    EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
                ]
                # a resize that does not return here continues to the shared pipeline
                if (
                    l_desc._method != ResizeMethod.bilinear
                    or not is_resize_bilinear_quantized
                    or non_quantized_bilinear
                ):
                    return self._build_resize(l_desc, inp)
            if l_desc.op == LayerType.nms:
                return self._build_nms(l_desc, inp, qp_in)
            if l_desc.op in [LayerType.shortcut, LayerType.portal]:
                return self._build_shortcut(l_desc, inp)
            if l_desc.op == LayerType.external_pad:
                return self._build_pad(l_desc, inp, qp_in)
            if l_desc.op == LayerType.format_conversion:
                return self._build_format_conversion(l_desc, inp)
            if l_desc.op in [LayerType.feature_interleave]:
                return self._build_feature_interleave(
                    l_desc,
                    inp,
                    l_desc.output_shape,
                    l_desc.padding == PaddingType.deconv,
                )
            if l_desc.op == LayerType.depth_to_space:
                return self._build_depth_to_space(l_desc, inp)
            if l_desc.op == LayerType.space_to_depth:
                return self._build_space_to_depth(l_desc, inp)
            if l_desc.op == LayerType.slice:
                return self._build_slice(l_desc, inp)
            if l_desc.op == LayerType.argmax:
                return self._build_argmax(l_desc, inp)
            if l_desc.op == LayerType.reduce_max:
                return self._build_reduce_max(l_desc, inp)
            if l_desc.op == LayerType.feature_shuffle:
                return self._build_feature_shuffle(l_desc, inp)
            if l_desc.op == LayerType.softmax:
                return self._build_softmax(l_desc, inp, qp_in)
            if l_desc.op == LayerType.feature_splitter:
                return self._build_feature_splitter(l_desc, inp)
            if l_desc.op == LayerType.row_splitter:
                return self._build_row_splitter(l_desc, inp)
            if l_desc.op == LayerType.width_splitter:
                return self._build_width_splitter(l_desc, inp)
            if l_desc.op == LayerType.precision_splitter:
                return self._build_precision_splitter(l_desc, inp)
            if l_desc.op == LayerType.layer_normalization:
                return self._build_layernorm(l_desc, inp, qp_in)
            if l_desc.op in [LayerType.postprocess]:
                raise BackendEmulatorException(f"Currently emulator does not support {l_desc.op.value} layers")

            total_for_act = []  # collect all parts of the tensor after op for activation
            total_ops = self.get_number_of_expected_ops(l_desc)
            # more than one op is expected only on pluto with 16-bit input (see get_number_of_expected_ops);
            # the inputs are then split into low/high bytes, in the numeric targets only
            use_split_inputs = total_ops > 1 and self.target in [
                EmulationInferenceTargets.SDK_NUMERIC,
                EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
            ]
            if use_split_inputs:
                low_inputs, high_inputs = self.split_inputs_for_16bit_precision(l_desc, inputs)
            for op_idx in range(total_ops):
                if use_split_inputs:
                    # even op_idx -> low input bytes, odd op_idx -> high input bytes
                    inputs = high_inputs if op_idx % 2 else low_inputs
                    inp = inputs[0]
                inter_layer_precision_mode = InterLayerPrecisionMode.from_hailo_nn(self.hailo_nn, l_desc.name)
                # NOTE: the dict is re-created on every iteration, so entries of previous op_idx are dropped
                self.conv_layers_inference[self._get_param_key(l_desc.name)] = {}
                layer_inference = self.conv_layers_inference[self._get_param_key(l_desc.name)]
                layer_inference["input_name"] = inp.name
                layer_inference["input_tensor"] = inp
                layer_inference["dynamic_weights"] = l_desc.dynamic_weights
                if l_desc.dynamic_weights:
                    weights_index = 0 if l_desc.op == LayerType.feature_multiplier else 1
                    layer_inference["weights_input_name"] = inputs[weights_index].name
                    layer_inference["weights_input_tensor"] = inputs[weights_index]

                if l_desc.op == LayerType.avgpool:
                    layer_inference["avgpool_kernel_shape"] = [
                        l_desc.kernel_shape[1],
                        l_desc.kernel_shape[2],
                        l_desc.input_shape[3],
                    ]

                if l_desc.op == LayerType.reduce_sum:
                    layer_inference["reducesum_kernel_shape"] = [
                        l_desc.kernel_shape[1],
                        l_desc.kernel_shape[2],
                        l_desc.kernel_shape[3],
                    ]
                if l_desc.op == LayerType.reduce_mean:
                    layer_inference["reducemean_kernel_shape"] = [
                        l_desc.kernel_shape[1],
                        l_desc.kernel_shape[2],
                        l_desc.kernel_shape[3],
                    ]

                if l_desc.op == LayerType.resize and is_resize_bilinear_quantized:
                    layer_inference["resize_bilinear_pixel_mode"] = l_desc.resize_bilinear_pixels_mode.value

                # store all information in the layer_inference
                layer_inference["activation_type"] = l_desc.activation
                layer_inference["layer_type"] = l_desc.op

                layer_inference["use_4bit_weights"] = (
                    False
                    if l_desc.precision_config.precision_mode is None
                    else l_desc.precision_config.precision_mode.reduce() == PrecisionMode.a8_w4
                )
                layer_inference["exponential_mode_4bit_weights"] = (
                    l_desc.precision_config.precision_mode == PrecisionMode.a8_w4_exp
                )
                layer_inference["bias_mode"] = l_desc.precision_config.bias_mode

                # Used in legacy quantization...
                layer_inference["max_elementwise_feed_repeat"] = l_desc.translation_config.max_elementwise_feed_repeat
                layer_inference["max_bias_feed_repeat"] = l_desc.translation_config.max_bias_feed_repeat
                layer_inference["activation_fit"] = l_desc.translation_config.activation_fit

                # convs with decomposed weights are built by a dedicated helper and return right away
                if (
                    l_desc.op == LayerType.conv
                    # temporarily ignore decompose weights for pluto until we figure it out
                    and (l_desc.decompose_weights and not self.is_pluto_arch)
                    and self.target
                    in [
                        EmulationInferenceTargets.SDK_NUMERIC,
                        EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
                        EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,
                    ]
                ):
                    op = self._build_conv_decompose(l_desc, inputs, inter_layer_precision_mode, None)
                    self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, [op])
                    return op
                mult_shift = self._get_variable_with_reuse(
                    "mult_shift",
                    initial_value=0,
                    dtype=tf.int8,
                    trainable=False,
                    name="output_stage/mult_shift",
                    shape=tf.TensorShape(None),
                )
                mult_shift = tf.cast(mult_shift, dtype=tf.float32)
                # with split inputs each partial op takes its own entry of mult_shift
                self.emulation_shift = mult_shift[op_idx] if use_split_inputs else mult_shift

                # mult_shift might be a vector or scalar
                self.emulation_shift = tf.reshape(self.emulation_shift, [-1])
                is_16bit_precision_mode = self._is_16bit_input(l_desc)
                self._set_emulation_mult_shift(l_desc, is_16bit_precision_mode)
                k = None
                if self.target == EmulationInferenceTargets.SDK_FINE_TUNE and self.mixed_native_layers == []:
                    # ..this is just a precaution in case prev output wasn't quantized somehow... mostly shouldn't matter.
                    #     in the special "mixed" case, it would mess with leaving prev' layer in native, so we avoid this.
                    input_activation_bits = inter_layer_precision_mode.input_activation_bits
                    limvals_in_min, limvals_in_max = self._get_const("limvals_in", translated=True)
                    if (limvals_in_min < 0 and limvals_in_max < 0) or (limvals_in_min > 0 and limvals_in_max > 0):
                        raise BackendEmulatorException(
                            "Can't fake quant properly because TF got undefined behavior "
                            "when doing fake-quant with same-sign limits",
                        )  # TODO work around this
                    inp = tf.quantization.fake_quant_with_min_max_args(
                        inp,
                        min=limvals_in_min,
                        max=limvals_in_max,
                        num_bits=input_activation_bits,
                        name=f"{l_desc.name}_fine_tune_quant_inp",
                    )

                # ---- main op of the layer, dispatched by layer type ----
                if l_desc.op in [LayerType.conv, LayerType.dw, LayerType.avgpool, LayerType.deconv]:
                    # with split inputs: op_idx 0,1 use weight part 0 and op_idx 2,3 use weight part 1
                    weight_op_idx = op_idx // 2 if use_split_inputs else None
                    if l_desc.op == LayerType.conv and (
                        l_desc.input_disparity > 1 or l_desc.kernel_shape[2] // l_desc.input_features > 1
                    ):
                        # NOTE(review): elementwise_add_op is not bound on this path, yet the
                        # SDK_PARTIAL_NUMERIC elementwise-add branch further down reads it when
                        # ew_add_enabled - confirm this combination cannot occur
                        op, k = self._build_3d_conv(l_desc, inputs, inter_layer_precision_mode)
                    elif l_desc.ew_add_enabled and self.target in [
                        EmulationInferenceTargets.SDK_NUMERIC,
                        EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
                        EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,
                    ]:
                        elementwise_add_op = self._build_conv_and_ew_add(
                            l_desc, layer_inputs, inter_layer_precision_mode
                        )
                        op, k = self._build_conv(
                            l_desc,
                            inputs,
                            inter_layer_precision_mode,
                            weight_op_idx=weight_op_idx,
                            elementwise=elementwise_add_op,
                        )
                    else:
                        op, k = self._build_conv(
                            l_desc,
                            inputs,
                            inter_layer_precision_mode,
                            weight_op_idx=weight_op_idx,
                            elementwise=None,
                        )

                elif l_desc.op == LayerType.dense:
                    op, k = self._build_dense(l_desc, inp, inter_layer_precision_mode)
                elif l_desc.op == LayerType.batch_norm:
                    if l_desc.ew_add_enabled and self.target in [
                        EmulationInferenceTargets.SDK_NUMERIC,
                        EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
                        EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,
                    ]:
                        elementwise_add_op = self._build_conv_and_ew_add(
                            l_desc, layer_inputs, inter_layer_precision_mode
                        )
                        op, k = self._build_batchnorm(l_desc, inp, inter_layer_precision_mode, elementwise_add_op)
                    else:
                        op, k = self._build_batchnorm(l_desc, inp, inter_layer_precision_mode)
                elif l_desc.op == LayerType.normalization:
                    if l_desc.ew_add_enabled and self.target in [
                        EmulationInferenceTargets.SDK_NUMERIC,
                        EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
                        EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,
                    ]:
                        elementwise_add_op = self._build_conv_and_ew_add(
                            l_desc, layer_inputs, inter_layer_precision_mode
                        )
                        op, k = self._build_normalization(l_desc, inp, inter_layer_precision_mode, elementwise_add_op)
                    else:
                        op, k = self._build_normalization(l_desc, inp, inter_layer_precision_mode)
                elif l_desc.op == LayerType.bbox_decoder:
                    op = self._build_bbox_decoder(l_desc, inputs)
                elif l_desc.op == LayerType.fused_bbox_decoder:
                    boxes = self._build_bbox_decoder_single_weight_per_proposal(l_desc, [inputs[0]])
                    scores = self._prepare_scores_for_proposal_generator(l_desc, inputs[1], inter_layer_precision_mode)
                    op = self._build_proposal_generator_layer(
                        l_desc,
                        [boxes, scores],
                        inter_layer_precision_mode,
                        skip_kernel_multiplication=True,
                    )
                elif l_desc.op == LayerType.ew_add:
                    op = self._build_standalone_ew_add_sub(l_desc, inputs, inter_layer_precision_mode, LayerType.ew_add)
                elif l_desc.op == LayerType.ew_sub:
                    op = self._build_standalone_ew_add_sub(l_desc, inputs, inter_layer_precision_mode, LayerType.ew_sub)
                elif l_desc.op == LayerType.ew_mult:
                    if l_desc._ew_mult_type == EWMultType.on_mac:
                        op = self._build_ew_mult_reduce_sum(l_desc, inputs, inter_layer_precision_mode, False)
                    else:
                        op = self._build_ew_mult(l_desc, inputs)
                elif l_desc.op == LayerType.activation:
                    op, k = self._build_standalone_activation(l_desc, inp, inter_layer_precision_mode)
                elif l_desc.op == LayerType.reduce_sum:
                    op, k = self.build_reduce_mean_or_sum_layer(l_desc, inp, inter_layer_precision_mode, tf.reduce_sum)
                elif l_desc.op == LayerType.reduce_mean:
                    op, k = self.build_reduce_mean_or_sum_layer(l_desc, inp, inter_layer_precision_mode, tf.reduce_mean)
                elif l_desc.op == LayerType.matmul:
                    op, k = self._build_matmul(l_desc, inputs, inter_layer_precision_mode)
                elif l_desc.op == LayerType.feature_multiplier:
                    if l_desc._ew_mult_type == EWMultType.on_mac:
                        op = self._build_ew_mult_reduce_sum(l_desc, inputs, inter_layer_precision_mode, True)
                    else:
                        op = self._build_feature_multiplier_layer(l_desc, inp)
                elif l_desc.op == LayerType.resize and is_resize_bilinear_quantized:
                    op, k = self._build_resize(l_desc, inp, inter_layer_precision_mode)
                elif l_desc.op == LayerType.proposal_generator:
                    op = self._build_proposal_generator_layer(l_desc, inputs, inter_layer_precision_mode)
                else:
                    raise BackendNotImplementedError(f"Unknown type of op {l_desc.op}")

                # in partial numeric the op result is divided by the mult shift and wrapped to the accumulator size
                if self.target == EmulationInferenceTargets.SDK_PARTIAL_NUMERIC and l_desc.op not in [
                    LayerType.ew_mult,
                    LayerType.feature_multiplier,
                ]:
                    accumulator_size = self._get_accumulator_size(inter_layer_precision_mode, is_16bit_precision_mode)
                    shift = tf.cast(self.emulation_mult_shift, op.dtype)
                    op = tf.divide(op, shift)
                    op = wrap_around(op, accumulator_size)
                native_targets = [
                    EmulationInferenceTargets.SDK_NATIVE,
                    EmulationInferenceTargets.SDK_FP_OPTIMIZED,
                    EmulationInferenceTargets.SDK_FINE_TUNE,
                ]

                # ---- bias stage: skipped (b_op = op) for the layer types / conditions listed below ----
                if (
                    (
                        l_desc.op
                        in [
                            LayerType.avgpool,
                            LayerType.ew_add,
                            LayerType.ew_sub,
                            LayerType.activation,
                            LayerType.reduce_sum,
                            LayerType.reduce_mean,
                        ]
                        and self.target in native_targets
                    )
                    or (
                        l_desc.op
                        in [
                            LayerType.bbox_decoder,
                            LayerType.fused_bbox_decoder,
                            LayerType.ew_mult,
                            LayerType.matmul,
                            LayerType.feature_multiplier,
                            LayerType.proposal_generator,
                        ]
                    )
                    or l_desc.dynamic_weights
                    or (l_desc.op == LayerType.resize and is_resize_bilinear_quantized)
                ):
                    b_op = op
                else:
                    b_op = self._build_bias(
                        l_desc, op, inter_layer_precision_mode, op_idx=op_idx if use_split_inputs else None
                    )
                    if self.target == EmulationInferenceTargets.SDK_PARTIAL_NUMERIC:
                        # accumulator_size was bound above: ew_mult / feature_multiplier never reach this branch
                        b_op = wrap_around(b_op, accumulator_size, name="bias_op_wraparound")

                for_act = b_op
                add_op = None

                # ---- elementwise add fused into the layer: added here as a separate op for the
                # targets other than SDK_NUMERIC / SDK_DEBUG_PRECISE_NUMERIC ----
                layer_idx_in_second_input = 0
                if (
                    l_desc.op
                    in {
                        LayerType.conv,
                        LayerType.normalization,
                        LayerType.dw,
                        LayerType.batch_norm,
                    }
                    and self.target
                    not in {
                        EmulationInferenceTargets.SDK_NUMERIC,
                        EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
                    }
                    and l_desc.ew_add_enabled
                ):
                    if self.target in [EmulationInferenceTargets.SDK_PARTIAL_NUMERIC]:
                        shift = tf.cast(self.emulation_mult_shift, op.dtype)
                        elementwise_add_op = tf.divide(elementwise_add_op, shift)
                        accumulator_size = self._get_accumulator_size(
                            inter_layer_precision_mode,
                            is_16bit_precision_mode,
                        )
                        add_op = self._build_add(l_desc, for_act, elementwise_add_op)
                        add_op = wrap_around(add_op, accumulator_size)
                    else:
                        # the second input may have several outputs; pick the one feeding this layer
                        if len(self.nodes[layer_inputs[1]].out) > 1:
                            layer_idx_in_second_input = layer_inputs[1].outputs.index(l_desc.name)
                        add_op = self._build_add(
                            l_desc,
                            for_act,
                            self.nodes[layer_inputs[1]].out[layer_idx_in_second_input],
                        )

                    for_act = add_op
                    layer_inference["elementwise_name"] = (
                        self.nodes[layer_inputs[1]].out[layer_idx_in_second_input].name
                    )
                    layer_inference["elementwise_tensor"] = self.nodes[layer_inputs[1]].out[layer_idx_in_second_input]

                if l_desc.op in [LayerType.ew_add, LayerType.ew_sub, LayerType.ew_mult]:
                    if len(self.nodes[layer_inputs[1]].out) > 1:
                        layer_idx_in_second_input = layer_inputs[1].outputs.index(l_desc.name)
                    layer_inference["elementwise_name"] = (
                        self.nodes[layer_inputs[1]].out[layer_idx_in_second_input].name
                    )
                    layer_inference["elementwise_tensor"] = self.nodes[layer_inputs[1]].out[layer_idx_in_second_input]

                if l_desc.op == LayerType.deconv and self.target in [
                    EmulationInferenceTargets.SDK_NUMERIC,
                    EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
                    EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,
                ]:
                    # the feature_interleave or depth_to_space layer as part of deconv is relevant only in NUMERIC.
                    # for SDK_NATIVE, in conv_op eventually conv2d_transpose is called, and that is the full native
                    # implementation of deconv (conv + fi or d2s together)
                    if is_super_deconv(l_desc):
                        for_act = self._build_super_deconv_depth_to_space(l_desc, for_act)
                        for_act = self._build_super_deconv_slice(l_desc, for_act)
                    else:
                        for_act = self._build_deconv_feature_interleave(l_desc, for_act)
                    # returns list because of first call

                total_for_act.append(for_act)

            # ---- activation / output stage, built once on the (combined) pre-activation tensor ----
            out8_pre_round = None
            act = None
            out_stage_stats = None
            self.conv_layers_inference[self._get_param_key(l_desc.name)]["pre_act_name"] = total_for_act[0].name

            result_inp = tf.zeros_like(total_for_act[0])
            if self._is_16bit_input(l_desc) and self.is_pluto_arch:
                # sum the partial results, each one scaled by 2 ** its entry of "pre_act_sum_shift"
                shifts = tf.cast(
                    tf.Variable(
                        initial_value=tf.zeros(len(total_for_act), dtype=tf.int8),
                        dtype=tf.int8,
                        trainable=False,
                        name="pre_act_sum_shift",
                    ),
                    dtype=tf.float32,
                )

                # TODO should fsm implementation move to activation.cc ?
                for i in range(len(total_for_act)):
                    shifted_tensor = total_for_act[i] * (2 ** shifts[i])
                    result_inp += shifted_tensor
            else:
                result_inp = total_for_act[0]

            # NOTE(review): SDK_DEBUG_PRECISE_NUMERIC is not listed here, so it takes the plain
            # activation path below - confirm intended
            if self.target in [EmulationInferenceTargets.SDK_NUMERIC, EmulationInferenceTargets.SDK_PARTIAL_NUMERIC]:
                post_act, out8_pre_round, out_stage_stats = self._build_output_stage(
                    result_inp,
                    l_desc,
                    inter_layer_precision_mode,
                )
            else:
                if self.activation_callback is not None:
                    post_act = self.activation_callback(l_desc.activation, result_inp)
                else:
                    post_act = self._build_activation(
                        l_desc.activation,
                        result_inp,
                        l_desc,
                        inter_layer_precision_mode,
                    )
                act = post_act

            out = post_act
            if l_desc in self.mixed_numeric_layers:
                out = self._from_numeric(out)

            # layer_inference still refers to the dict created in the last iteration of the loop above
            layer_inference["output_tensor"] = out
            layer_inference["output_name"] = out.name

            stats = None
            if self.target != EmulationInferenceTargets.SDK_NATIVE:
                # gather statistics for quantization
                stats = self._gather_statistics(l_desc, result_inp, out_stage_stats)

            activation_histogram = self._activation_histograms.get(l_desc.name, None)

            if l_desc.op == LayerType.avgpool:
                k = None
                if l_desc.is_global_avg_pool():
                    # update the global avg pool shape to be rank 2, similar to the hw
                    # NOTE(review): next() raises StopIteration when the layer has no successors
                    succ = next(iter(self.hailo_nn.successors(l_desc)))
                    if succ.op in (LayerType.dense, LayerType.softmax):
                        shape = [-1] + l_desc.output_shape[3:]
                        out = tf.reshape(out, shape=shape)

            # handle special cases where output is transformed (currently supports two types)
            if (
                l_desc.op
                in [LayerType.batch_norm, LayerType.normalization, LayerType.dw, LayerType.activation, LayerType.conv]
                and l_desc.transpose_output_width_features
            ):
                out = tf.transpose(a=out, perm=[0, 1, 3, 2])
            elif l_desc.spatial_flatten_output:
                out = tf.reshape(tensor=out, shape=l_desc.output_shape)

            if l_desc.op == LayerType.feature_multiplier:
                # Cut each tensor in feature dim by output shape
                output_features = [output_shape[-1] for output_shape in l_desc.output_shapes]
                out = tf.split(out, num_or_size_splits=output_features, axis=-1)
            else:
                out = [out]

            # op, b_op, add_op, for_act and k hold the values of the last iteration of the loop above
            self.nodes[l_desc] = TFNode(
                l_desc.name,
                tf.identity(inp),
                op,
                b_op,
                add_op,
                for_act,
                act,
                out,
                k=k,
                out8_pre_round=out8_pre_round,
                pre_act=for_act,
                post_act=post_act,
                stats=stats,
                activation_histogram=activation_histogram,
            )

            return self.nodes[l_desc].out

    def _handle_layers_with_sdk_mixed(self, inputs, l_desc, qp_in):
        """
        Translate the inputs of a mixed-numeric layer to numeric values via ``self._to_numeric``.

        For ew_add / ew_sub / ew_mult / matmul layers and for layers with dynamic weights, every input
        is translated with the ``qp_out`` and ``limvals_out`` variables of the layer that produced it
        (found through the tensor name). Otherwise the given ``qp_in`` and a fresh ``limvals_in``
        variable are used. When the layer has elementwise add enabled, the input at index 1 (the
        add operand) is left untouched.

        Args:
            inputs: list of input tensors; modified in place
            l_desc: the layer whose inputs are translated
            qp_in: default input quantization params variable of the layer

        Returns: the same ``inputs`` list, holding the translated tensors

        """
        # Default limits; overridden per input when the producer's own output params are used.
        limvals_in = tf.Variable(initial_value=tf.zeros(2), dtype=tf.float32, trainable=False, name="limvals_in")
        skip_ew_add_operand = getattr(l_desc, "ew_add_enabled", False)
        uses_producer_params = (
            l_desc.op in {LayerType.ew_add, LayerType.ew_sub, LayerType.ew_mult, LayerType.matmul}
            or l_desc.dynamic_weights
        )
        for idx, tensor in enumerate(inputs):
            if skip_ew_add_operand and idx == 1:
                # the add operand of the elementwise add is not translated (may be supported in the future)
                continue
            if uses_producer_params:
                # the second component of the tensor name is used as the producing layer's name
                producer_name = tensor.name.split("/")[1]
                producer_scope = self._get_scope_from_the_name_of_tensor_or_layer(tensor)
                qp_in = self._get_variable_with_scope(
                    variable_name="qp_out",
                    layer_name=producer_name,
                    scope=producer_scope,
                )
                limvals_in = self._get_variable_with_scope(
                    variable_name="limvals_out",
                    layer_name=producer_name,
                    scope=producer_scope,
                )
            inputs[idx] = self._to_numeric(tensor, qp_in, limvals_in)
        return inputs

    def build_min_max_out(self, tensor, output_min_max_strategy, layer_outputs):
        """Build the min/max statistics tensors of a layer output.

        Args:
            tensor: The output tensor the statistics are computed on.
            output_min_max_strategy: An ``OutputMinMaxStrategy`` member selecting how the
                range is derived.
            layer_outputs: Successor layers of the layer; any non-default strategy requires
                all of them to be output layers.

        Returns:
            A ``(stats_min_out, stats_max_out)`` tuple of tensors.

        Raises:
            BackendEmulatorException: If a non-default strategy is requested for a layer
                that has a successor which is not an output layer, or if the strategy is
                unknown.
        """
        if output_min_max_strategy == OutputMinMaxStrategy.default:
            stats_min_out = tf.reduce_min(input_tensor=tensor, name="stats_min_out")
            stats_max_out = tf.reduce_max(input_tensor=tensor, name="stats_max_out")
            return stats_min_out, stats_max_out

        # Any non-default strategy is validated before a single op is created.
        for layer in layer_outputs:
            if layer.op != LayerType.output_layer:
                raise BackendEmulatorException(
                    "non-default OutputMinMaxStrategy currently supported in last layers only",
                )

        if output_min_max_strategy == OutputMinMaxStrategy.sigmoid:
            # Fixed [-4, 4] range for sigmoid outputs.
            stats_min_out = tf.constant(-4, name="stats_min_out")
            stats_max_out = tf.constant(4, name="stats_max_out")
        elif output_min_max_strategy == OutputMinMaxStrategy.softmax:
            stats_max_out = tf.reduce_max(input_tensor=tensor, name="stats_max_out")
            # Collapse the trailing axes with max until at most one axis is left, then take
            # the smallest of those maxima and lower it by 12.
            # The public ``ndims`` accessor is used (not the private ``_ndims``), matching
            # the rest of this class.
            stats_min_out = tf.reduce_max(input_tensor=tensor, axis=tensor.shape.ndims - 1)
            while stats_min_out.shape.ndims > 1:
                stats_min_out = tf.reduce_max(input_tensor=stats_min_out, axis=stats_min_out.shape.ndims - 1)
            stats_min_out = tf.subtract(tf.reduce_min(input_tensor=stats_min_out), 12, name="stats_min_out")
        else:
            raise BackendEmulatorException("unknown output min/max strategy")

        return stats_min_out, stats_max_out

    def _gather_statistics(self, l_desc: Layer, pre_act_tensor, out_stage_stats):
        """Create the statistics tensors of a layer and register them in its inference dict.

        For every statistic both the tensor and its graph name are stored in
        ``self.conv_layers_inference[<param key of the layer>]`` under ``stats_*_tensor`` /
        ``stats_*_name`` keys.

        Args:
            l_desc: Description of the layer the statistics belong to.
            pre_act_tensor: The pre-activation tensor of the layer.
            out_stage_stats: Indexable whose items 0 and 1 are the clipped-values min/max
                tensors; only read for the SDK_NUMERIC / SDK_PARTIAL_NUMERIC targets.

        Returns:
            A ``TFStats`` holding all the created tensors; entries that do not apply to
            this layer are ``None``.
        """
        successors = list(self.hailo_nn.successors(l_desc))
        # TODO: do we need additional states for sigmoid / softmax strategies?
        min_max_strategy = (
            OutputMinMaxStrategy.softmax if l_desc.op == LayerType.softmax else OutputMinMaxStrategy.default
        )
        layer_inference = self.conv_layers_inference[self._get_param_key(l_desc.name)]

        # --- elementwise input statistics (only when the layer registered one) ---
        ew_min = ew_max = ew_min_features = ew_max_features = None
        if "elementwise_name" in layer_inference:
            ew_tensor = layer_inference["elementwise_tensor"]
            ew_min = tf.reduce_min(input_tensor=ew_tensor, name="stats_min_elwa")
            ew_max = tf.reduce_max(input_tensor=ew_tensor, name="stats_max_elwa")

            # Reduce the leading axis repeatedly until rank 2, then once more (named).
            ew_min_features = ew_max_features = ew_tensor
            while ew_max_features.shape.ndims > 2:
                ew_min_features = tf.reduce_min(input_tensor=ew_min_features, axis=0)
                ew_max_features = tf.reduce_max(input_tensor=ew_max_features, axis=0)
            ew_min_features = tf.reduce_min(
                input_tensor=ew_min_features,
                axis=0,
                name="stats_min_out_features_elwa",
            )
            ew_max_features = tf.reduce_max(
                input_tensor=ew_max_features,
                axis=0,
                name="stats_max_out_features_elwa",
            )

            layer_inference.update(
                {
                    "stats_min_elementwise_name": ew_min.name,
                    "stats_max_elementwise_name": ew_max.name,
                    "stats_min_elementwise_tensor": ew_min,
                    "stats_max_elementwise_tensor": ew_max,
                    "stats_min_elementwise_features_name": ew_min_features.name,
                    "stats_max_elementwise_features_name": ew_max_features.name,
                    "stats_min_elementwise_features_tensor": ew_min_features,
                    "stats_max_elementwise_features_tensor": ew_max_features,
                },
            )

        # --- weights-input statistics (only when the layer registered one) ---
        weights_in_min = weights_in_max = None
        if "weights_input_name" in layer_inference:
            weights_in = layer_inference["weights_input_tensor"]
            weights_in_min = tf.reduce_min(input_tensor=weights_in, name="stats_min_weights_in")
            weights_in_max = tf.reduce_max(input_tensor=weights_in, name="stats_max_weights_in")
            layer_inference.update(
                {
                    "stats_min_weights_input_name": weights_in_min.name,
                    "stats_max_weights_input_name": weights_in_max.name,
                    "stats_min_weights_input_tensor": weights_in_min,
                    "stats_max_weights_input_tensor": weights_in_max,
                },
            )

        # --- global min/max of input, output and pre-activation ---
        in_tensor = layer_inference["input_tensor"]
        in_min = tf.reduce_min(input_tensor=in_tensor, name="stats_min_inp")
        in_max = tf.reduce_max(input_tensor=in_tensor, name="stats_max_inp")
        out_tensor = layer_inference["output_tensor"]
        out_min, out_max = self.build_min_max_out(out_tensor, min_max_strategy, successors)
        pre_act_min = tf.reduce_min(input_tensor=pre_act_tensor, name="stats_min_pre_act")
        pre_act_max = tf.reduce_max(input_tensor=pre_act_tensor, name="stats_max_pre_act")

        # All axes except the last one.
        pre_act_axes = tuple(range(pre_act_tensor.shape.ndims - 1))
        in_axes = tuple(range(in_tensor.shape.ndims - 1))

        # --- statistics reduced over every axis but the last ---
        pre_act_min_features = tf.reduce_min(
            input_tensor=pre_act_tensor,
            axis=pre_act_axes,
            name="stats_min_pre_act_features",
        )
        pre_act_max_features = tf.reduce_max(
            input_tensor=pre_act_tensor,
            axis=pre_act_axes,
            name="stats_max_pre_act_features",
        )
        # NOTE(review): the output energy is reduced over the pre-activation axes, i.e. it
        # presumes the output has the same rank as the pre-activation tensor - confirm.
        energy_out_features = tf.reduce_mean(
            input_tensor=tf.square(out_tensor),
            axis=pre_act_axes,
            name="stats_output_energy_features",
        )
        energy_in_features = tf.reduce_mean(
            input_tensor=tf.square(in_tensor),
            axis=in_axes,
            name="stats_input_energy_features",
        )
        energy_pre_features = tf.reduce_mean(
            input_tensor=tf.square(pre_act_tensor),
            axis=pre_act_axes,
            name="stats_pre_energy_features",
        )

        # Fraction of non-zero output entries (mean over the whole tensor).
        zero = tf.constant(0, dtype=out_tensor.dtype)
        non_zero_mask = tf.cast(tf.not_equal(out_tensor, zero), dtype=out_tensor.dtype)
        non_zero_percent = tf.reduce_mean(input_tensor=non_zero_mask, name="stats_non_zero_percent_features")

        # NOTE(review): unlike the elementwise variant above, only axis 0 is reduced here.
        out_min_features = tf.reduce_min(input_tensor=out_tensor, axis=0, name="stats_min_out_features")
        out_max_features = tf.reduce_max(input_tensor=out_tensor, axis=0, name="stats_max_out_features")

        # Register names first and tensors second, keeping the established key order.
        layer_inference.update(
            {
                "stats_min_input_name": in_min.name,
                "stats_max_input_name": in_max.name,
                "stats_min_output_name": out_min.name,
                "stats_max_output_name": out_max.name,
                "stats_min_output_features_name": out_min_features.name,
                "stats_max_output_features_name": out_max_features.name,
                "stats_min_pre_act_features_name": pre_act_min_features.name,
                "stats_max_pre_act_features_name": pre_act_max_features.name,
                "stats_output_energy_features_name": energy_out_features.name,
                "stats_input_energy_features_name": energy_in_features.name,
                "stats_pre_energy_features_name": energy_pre_features.name,
                "stats_min_pre_act_name": pre_act_min.name,
                "stats_max_pre_act_name": pre_act_max.name,
                "stats_non_zero_percent_features_name": non_zero_percent.name,
            },
        )
        layer_inference.update(
            {
                "stats_min_input_tensor": in_min,
                "stats_max_input_tensor": in_max,
                "stats_min_output_tensor": out_min,
                "stats_max_output_tensor": out_max,
                "stats_min_pre_act_tensor": pre_act_min,
                "stats_max_pre_act_tensor": pre_act_max,
                "stats_min_output_features_tensor": out_min_features,
                "stats_max_output_features_tensor": out_max_features,
                "stats_min_pre_act_features_tensor": pre_act_min_features,
                "stats_max_pre_act_features_tensor": pre_act_max_features,
                "stats_output_energy_features_tensor": energy_out_features,
                "stats_input_energy_features_tensor": energy_in_features,
                "stats_pre_energy_features_tensor": energy_pre_features,
                "stats_non_zero_percent_features_tensor": non_zero_percent,
            },
        )

        # --- clipped output values, available for the numeric targets only ---
        clipped_min = clipped_max = None
        if self.target in [EmulationInferenceTargets.SDK_NUMERIC, EmulationInferenceTargets.SDK_PARTIAL_NUMERIC]:
            clipped_min = out_stage_stats[0]
            clipped_max = out_stage_stats[1]
            layer_inference.update(
                {
                    "stats_out_clipped_values_min_name": clipped_min.name,
                    "stats_out_clipped_values_max_name": clipped_max.name,
                    "stats_out_clipped_values_min_tensor": clipped_min,
                    "stats_out_clipped_values_max_tensor": clipped_max,
                },
            )

        return TFStats(
            input_min=in_min,
            input_max=in_max,
            output_min=out_min,
            output_max=out_max,
            pre_act_min=pre_act_min,
            pre_act_max=pre_act_max,
            elementwise_min=ew_min,
            elementwise_max=ew_max,
            output_min_features=out_min_features,
            output_max_features=out_max_features,
            stats_min_elementwise_features=ew_min_features,
            stats_max_elementwise_features=ew_max_features,
            stats_min_pre_act_features=pre_act_min_features,
            stats_max_pre_act_features=pre_act_max_features,
            stats_energy_out_features=energy_out_features,
            stats_energy_in_features=energy_in_features,
            stats_energy_pre_features=energy_pre_features,
            out_clipped_values_min=clipped_min,
            out_clipped_values_max=clipped_max,
            stats_min_weights_input=weights_in_min,
            stats_max_weights_input=weights_in_max,
            stats_non_zero_percent_features=non_zero_percent,
        )

    def _build_input_layer(self, l_desc):
        """Build the graph node of an input layer (real input or constant input).

        For the numeric targets, when ``self.translate_input`` is set and the layer is a
        real network input, the input is clipped to ``limvals_out`` and quantized with
        ``qp_out``. Otherwise the input (or the batch-repeated constant of a
        ``const_input`` layer) is passed through, optionally after format conversion.

        The created node is stored in ``self.nodes``; non-constant inputs are also added to
        ``self._input_nodes``, and the layer's entry in ``self.conv_layers_inference`` is
        (re)initialized.

        Args:
            l_desc: Description of the input layer.

        Returns:
            A deep copy of ``l_desc.input_shape`` with the batch dimension set to ``None``
            and, for transposed 4-D layers, dimensions 1 and 2 swapped.

        Raises:
            BackendEmulatorException: If the layer is neither an ``input_layer`` nor a
                ``const_input`` (non-numeric path).
        """
        is_16bit_precision_mode = self._is_16bit_input(l_desc)
        shape = copy.deepcopy(l_desc.input_shape)
        shape[0] = None
        if l_desc.transposed and len(shape) == 4:
            shape[1], shape[2] = shape[2], shape[1]
        is_real_input = l_desc.op == LayerType.input_layer and l_desc.is_real_io
        original_name = (
            "Placeholder"
            if l_desc.original_names is None or len(l_desc.original_names) < 1
            else l_desc.original_names[-1].split(":")[0]
        )

        def build_placeholder():
            # Must be called inside the layer's variable scope so the tensor is named there.
            inp_shape = self._input_shape_by_conversion(l_desc, shape)
            if self._custom_inputs and l_desc.name in self._custom_inputs:
                placeholder = tf.identity(self._custom_inputs[l_desc.name], name=original_name)
                placeholder.set_shape(inp_shape)
            else:
                placeholder = tf.keras.Input(dtype=tf.float32, shape=inp_shape[1:], name=original_name)
            return placeholder

        numeric_targets = [
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
            EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,
        ]
        # ``is_real_input`` already implies the layer is an ``input_layer``.
        if self.target in numeric_targets and self.translate_input and is_real_input:
            with self.g.as_default():
                scope = self._get_scope_from_the_name_of_tensor_or_layer(l_desc)
                qp_out = self._get_variable_with_scope(
                    variable_name="qp_out",
                    layer_name=l_desc.name_without_scope,
                    scope=scope,
                )
                limvals_out = self._get_variable_with_scope(
                    variable_name="limvals_out",
                    layer_name=l_desc.name_without_scope,
                    scope=scope,
                )

                with tf.compat.v1.variable_scope(l_desc.name_without_scope) as self.current_scope:
                    inp = build_placeholder()
                    last_layer = self._get_conversion_layer(l_desc, inp, shape)
                    # Quantize: clip to the limvals, then round(x / qp_out[1] + qp_out[0]).
                    inp_c = tf.clip_by_value(last_layer, limvals_out[0], limvals_out[1])
                    out = tf.math.rint(tf.add(tf.divide(inp_c, qp_out[1]), qp_out[0]))
                    if self._is_int32_numeric(l_desc, is_16bit_precision_mode):
                        out = tf.cast(out, tf.int32)
        else:
            with tf.compat.v1.variable_scope(l_desc.name_without_scope) as self.current_scope:
                if l_desc.op == LayerType.input_layer:
                    inp = build_placeholder()
                    out = self._get_conversion_layer(l_desc, inp, shape)

                elif l_desc.op == LayerType.const_input:
                    # The batch size of the constant is taken from the first real input.
                    assert len(self._input_nodes) > 0, "const_input requires a previously built input node"
                    real_input = self._input_nodes[next(iter(self._input_nodes.keys()))]
                    const_var = tf.Variable(tf.zeros(shape[1:]), trainable=False, name="const_data")
                    const_batch = tf.expand_dims(const_var, axis=0)

                    def repeat_const(tensor, myconst):
                        return tf.repeat(myconst, tf.shape(tensor)[0], axis=0)

                    # The closure captures ``const_batch``, which is never rebound, so the
                    # Lambda keeps working if it is invoked again after this method returns.
                    inp = tf.keras.layers.Lambda(lambda x: repeat_const(x, const_batch))(real_input)
                    spare_tiling = [1] if len(l_desc.input_tiles[0]) == 3 else []
                    out = tf.tile(inp, spare_tiling + l_desc.input_tiles[0])

                else:
                    raise BackendEmulatorException(
                        f"Layer {l_desc.name} of type {l_desc.op} can not be built as an input layer",
                    )

                if self._is_int32_numeric(l_desc, is_16bit_precision_mode):
                    out = tf.cast(out, tf.int32)

        if l_desc.transposed and len(shape) == 4:
            with tf.compat.v1.variable_scope(self.current_scope.name + "/"):
                transposed_shape = np.arange(len(shape))
                transposed_shape[1], transposed_shape[2] = transposed_shape[2], transposed_shape[1]
                out = tf.transpose(a=out, perm=transposed_shape)

        inp_node = TFNode(l_desc.name, inp, None, None, None, None, None, [out])
        self.nodes[l_desc] = inp_node
        if l_desc.op != LayerType.const_input:
            self._input_nodes[inp_node.name] = inp_node.inp

        self.conv_layers_inference[self._get_param_key(l_desc.name)] = {}
        layer_inference = self.conv_layers_inference[self._get_param_key(l_desc.name)]
        layer_inference["dummy_conv"] = None
        layer_inference["input_tensor"] = inp
        layer_inference["output_tensor"] = out
        layer_inference["output_name"] = out.name

        return shape

    def _input_shape_by_conversion(self, l_desc, shape):
        if l_desc.emulate_conversion:
            if l_desc.conversion_type == FormatConversionType.yuy2_to_hailo_yuv:
                return [shape[0], shape[1], shape[2], 2]
            if l_desc.conversion_type == FormatConversionType.tf_rgbx_to_hailo_rgb:
                return [shape[0], shape[1], shape[2], 4]
            if l_desc.conversion_type == FormatConversionType.nv12_to_hailo_yuv:
                return [shape[0], shape[1] // 2, shape[2], 3]
        return shape

    def _get_conversion_layer(self, l_desc, inp, shape):
        if l_desc.emulate_conversion:
            if l_desc.conversion_type == FormatConversionType.yuy2_to_hailo_yuv:
                return yuy2_to_yuv_conversion(inp, shape)
            if l_desc.conversion_type == FormatConversionType.tf_rgbx_to_hailo_rgb:
                return rgbx_to_rgb_conversion(inp, shape)
            if l_desc.conversion_type == FormatConversionType.nv12_to_hailo_yuv:
                return nv12_to_yuv_conversion(inp, shape)
        return inp

    def _transpose_output_layer(self, l_desc, inp):
        """Swap dimensions 1 and 2 of ``inp`` and make that the producer's output.

        The layer returned by ``self._find_first_non_mux_layer(l_desc)`` gets its node
        output replaced by a single-item list holding the transposed tensor.

        Returns:
            The transposed tensor.
        """
        producer = self._find_first_non_mux_layer(l_desc)

        # Identity permutation with entries 1 and 2 exchanged.
        perm = np.arange(len(inp.shape))
        perm[[1, 2]] = perm[[2, 1]]

        swapped = tf.transpose(a=inp, perm=perm)
        self.nodes[producer].out = [swapped]
        return swapped

    def _build_output_layer(self, l_desc, inp):
        """Build the graph node of an output layer on top of ``inp``.

        For transposed layers whose input has rank >= 3, the input is first transposed
        inside the variable scope derived from the input tensor's name. The result is
        wrapped in an identity op named ``output``, stored in ``self.nodes`` and registered
        in ``self.conv_layers_inference``.

        Returns:
            The identity output tensor.
        """
        # Everything before the last "/" of the tensor name (empty when there is none).
        producer_scope = inp.name.rpartition("/")[0]
        if l_desc.transposed and len(inp.shape) >= 3:
            with tf.compat.v1.variable_scope(producer_scope + "/"):
                inp = self._transpose_output_layer(l_desc, inp)

        output = tf.identity(inp, name="output")
        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, [output])

        self.conv_layers_inference[self._get_param_key(l_desc.name)] = {
            "dummy_conv": None,
            "input_tensor": inp,
            "output_tensor": output,
            "output_name": output.name,
        }
        return output

    def _is_activation_required(self, l_desc):
        """Return True unless the op of ``l_desc`` is one of the activation-less layer types."""
        ops_without_activation = (
            LayerType.input_layer,
            LayerType.const_input,
            LayerType.output_layer,
            LayerType.output_mux,
            LayerType.concat,
            LayerType.maxpool,
            LayerType.resize,
            LayerType.shortcut,
            LayerType.portal,
            LayerType.nms,
            LayerType.external_pad,
            LayerType.format_conversion,
            LayerType.feature_interleave,
            LayerType.depth_to_space,
            LayerType.space_to_depth,
            LayerType.slice,
            LayerType.argmax,
            LayerType.reduce_max,
            LayerType.feature_shuffle,
            LayerType.softmax,
            LayerType.feature_splitter,
            LayerType.row_splitter,
            LayerType.width_splitter,
            LayerType.precision_splitter,
            LayerType.layer_normalization,
            LayerType.postprocess,
        )
        return l_desc.op not in ops_without_activation

    def _is_int32_numeric(self, l_desc, is_16bit_precision_mode):
        if self.is_pluto_arch and self._is_activation_required(l_desc):
            # 16bit precision is implemented differenty in pluto
            return False
        return (self.run_numeric_in_int32 or is_16bit_precision_mode) and self.target in [
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
        ]

    def _build_conv_and_ew_add(self, l_desc, layer_inputs, inter_layer_precision_mode):
        """Build the elementwise-addition operand of a conv layer with a fused ew-add.

        The operand is the output of ``layer_inputs[1]`` that feeds ``l_desc``. It is scaled
        by the ``input_factor`` and ``feed_repeat`` variables, through ``self.hsim.h_add``
        for the numeric targets and through plain scalar multiplications otherwise. The
        (possibly numeric-translated) operand is also registered in the layer's inference
        dict under ``elementwise_name`` / ``elementwise_tensor``.

        Returns:
            The scaled elementwise tensor.
        """
        uses_16bit_input = self._is_16bit_input(l_desc)
        accumulator_size = self._get_accumulator_size(inter_layer_precision_mode, uses_16bit_input)

        with tf.compat.v1.variable_scope("elementwise_addition"):
            input_factor = tf.Variable(initial_value=0, dtype=tf.float32, trainable=False, name="input_factor")
            feed_repeat = tf.Variable(initial_value=0, dtype=tf.float32, trainable=False, name="feed_repeat")

            # The predecessor may have several outputs; pick the one feeding this layer.
            second_input = layer_inputs[1]
            second_input_outs = self.nodes[second_input].out
            out_idx = second_input.outputs.index(l_desc.name) if len(second_input_outs) > 1 else 0
            elementwise_in = second_input_outs[out_idx]

            # Built from the operand before any numeric translation below.
            zeros = tf.zeros_like(elementwise_in)

            if l_desc in self.mixed_numeric_layers:
                qp_in = tf.Variable(initial_value=tf.zeros(2), dtype=tf.float32, trainable=False, name="qp_elwa")
                limvals_in = tf.Variable(
                    initial_value=tf.zeros(2),
                    dtype=tf.float32,
                    trainable=False,
                    name="limvals_elwa",
                )
                elementwise_in = self._to_numeric(elementwise_in, qp_in, limvals_in)

            is_numeric_target = self.target in [
                EmulationInferenceTargets.SDK_NUMERIC,
                EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
            ]
            if is_numeric_target:
                if self._is_int32_numeric(l_desc, uses_16bit_input):
                    input_factor = tf.cast(input_factor, tf.int32)
                elementwise = self.hsim.h_add(
                    zeros,
                    elementwise_in,
                    input_factor,
                    feed_repeat,
                    self.emulation_mult_shift,
                    accumulator_size=accumulator_size,
                    use_fp16_acc=False,
                    name="elemenwise_op",
                )
            else:
                elementwise = tf.scalar_mul(feed_repeat, tf.scalar_mul(input_factor, elementwise_in))

            layer_inference = self.conv_layers_inference[self._get_param_key(l_desc.name)]
            layer_inference["elementwise_name"] = elementwise_in.name
            layer_inference["elementwise_tensor"] = elementwise_in
            return elementwise

    def _build_concat_layer(self, l_desc, concat_inputs):
        """Build the concat op of ``l_desc`` over ``concat_inputs`` and store its node.

        The concat axis is 1 for layers coming from dense or concatenating on
        ``spatial_h``, 2 for ``spatial_w`` and 3 otherwise. When the layer defines more
        than one group size, the inputs are sliced and interleaved group by group before
        the concatenation.

        Returns:
            The concatenated tensor.

        Raises:
            BackendEmulatorException: For a grouped concat on axis 2.
        """
        # By now all input-graph nodes for the concat layer should be declared on the graph.
        if l_desc.is_from_dense() or l_desc.axis == ConcatAxis.spatial_h:
            axis = 1
        elif l_desc.axis == ConcatAxis.spatial_w:
            axis = 2
        else:
            axis = 3

        group_sizes = l_desc.group_sizes
        if group_sizes is None or len(group_sizes) == 1:
            pieces = concat_inputs
        else:
            # For a group concat every input is sliced into groups in the given ratio; all
            # the first slices are concatenated first, then all the second slices, and so on.
            # For example, with two inputs of 6 and 3 features and group sizes of [1, 2],
            # the concat result is:
            # tf.concat([inp1[:,:,:,:2], inp2[:,:,:,:1], inp1[:,:,:,2:], inp2[:,:,:,1:]], axis=3)
            if axis == 2:
                raise BackendEmulatorException("concat_op does not support asymmetric groups and spatial axis")

            pieces = []
            groups_total = sum(group_sizes)
            groups_before = 0  # sum of the sizes of the groups already handled
            for group_size in group_sizes:
                for tensor in concat_inputs:
                    unit = tensor.shape[axis] // groups_total
                    start = unit * groups_before
                    end = start + unit * group_size
                    if axis == 3:
                        pieces.append(tensor[:, :, :, start:end])
                    elif axis == 1:
                        pieces.append(tensor[:, start:end, :, :])
                groups_before += group_size

        concat = tf.concat(pieces, axis, name="op")
        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, [concat])
        return concat

    def _build_proposal_generator_layer(
        self,
        l_desc,
        prop_inputs,
        inter_layer_precision_mode,
        skip_kernel_multiplication=False,
    ):
        """Build the proposal-generator graph from a boxes tensor and a scores tensor.

        ``prop_inputs[0]`` (boxes) and ``prop_inputs[1]`` (scores) are split along axis 1
        into ``l_desc.input_division_factor`` groups. Each group is zero-padded along
        axis 2, rearranged into proposals and reshaped; the groups are concatenated along
        axis 1 when there is more than one.

        For the SDK_NUMERIC / SDK_DEBUG_PRECISE_NUMERIC targets, unless
        ``skip_kernel_multiplication`` is set, the result is additionally passed through a
        1x1 depthwise op with a ``kernel`` variable (also stored in
        ``self._kernel_variables``).

        Returns:
            The proposals tensor, or the output of ``self._build_1x1_depthwise``.
        """
        boxes_full = prop_inputs[0]
        scores_full = prop_inputs[1]
        coords = l_desc.number_of_coordinates_per_proposal
        values = l_desc.values_per_proposal
        per_output = l_desc.proposals_per_output

        def pad_width(tensor, target_width):
            # Append zero columns on axis 2, cut out of a zeros copy of the tensor itself.
            zeros = tf.zeros_like(tensor)
            zeros = zeros[:, :, 0 : (target_width - tensor.shape[2]), :]
            if tensor.shape[2] == 1:
                # NOTE(review): the fixed factor 3 presumably assumes a padded width of 4 - confirm.
                zeros = tf.tile(zeros, [1, 1, 3, 1])
            return tf.concat([tensor, zeros], 2)

        concat_proposals_out = []
        rows_per_group = int(boxes_full.shape[1] / l_desc.input_division_factor)
        for group_idx in range(l_desc.input_division_factor):
            rows = slice(group_idx * rows_per_group, (group_idx + 1) * rows_per_group)
            boxes = boxes_full[:, rows, :, :]
            scores = scores_full[:, rows, :, :]

            # Width rounded up to a multiple of the proposals per output.
            padded_width = int(np.ceil(boxes.shape[2] / per_output) * per_output)
            anchors = int(boxes.shape[3] / coords)
            classes = int(scores.shape[3] / anchors)
            total_proposals = int(anchors * boxes.shape[1] * padded_width)
            prop_gen_outputs = int(total_proposals / per_output)

            boxes_flat = tf.reshape(pad_width(boxes, padded_width), [-1, 1, total_proposals, coords])
            boxes_flat = tf.tile(boxes_flat, [1, classes, 1, 1])

            scores_flat = tf.reshape(
                pad_width(scores, padded_width),
                [-1, scores.shape[1], padded_width, anchors, classes],
            )
            scores_flat = tf.transpose(a=scores_flat, perm=[0, 4, 1, 2, 3])
            scores_flat = tf.reshape(scores_flat, [-1, classes, total_proposals, 1])

            proposals = tf.concat([boxes_flat, scores_flat], 3)
            proposals = tf.reshape(
                proposals,
                [-1, boxes.shape[1], int(padded_width / coords), coords, anchors, values],
            )
            proposals = tf.transpose(a=proposals, perm=[0, 2, 1, 4, 3, 5])
            proposals_out = tf.reshape(proposals, [-1, classes, prop_gen_outputs, coords, values])
            proposals_out = tf.transpose(a=proposals_out, perm=[0, 1, 3, 2, 4])
            proposals_out = tf.reshape(proposals_out, [-1, classes, coords, values * prop_gen_outputs])
            concat_proposals_out.append(proposals_out)

        # With a single group the last (only) per-group result is used as is.
        if len(concat_proposals_out) > 1:
            proposals_out = tf.concat(concat_proposals_out, 1, name="proposal_out")

        numeric_targets = [
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
        ]
        if skip_kernel_multiplication or self.target not in numeric_targets:
            return proposals_out

        k, k_var = self._build_kernel(
            tf.Variable(tf.zeros([1, 1, proposals_out.shape[3], 1]), name="kernel"),
            l_desc,
        )
        self._kernel_variables[l_desc.name] = k_var
        k = tf.transpose(a=k, perm=[0, 1, 3, 2])

        return self._build_1x1_depthwise(
            proposals_out,
            k,
            l_desc,
            inter_layer_precision_mode,
            "proposal_activation_op",
        )

    def _build_pool(self, l_desc, inp, tf_callback):
        """Build a pooling op with explicit padding and store its node.

        The input is padded on dimensions 1 and 2 by the amounts ``calculate_padding``
        returns, using the ``padding_const_value`` constant as fill value, and then
        ``tf_callback`` is invoked with "VALID" padding.

        Args:
            l_desc: Description of the pooling layer.
            inp: Input tensor; padded as a 4-dimensional tensor.
            tf_callback: Pooling function, called as
                ``tf_callback(padded_input, kernel_shape, strides, "VALID", name="op")``.

        Returns:
            The pooled tensor (converted by ``self._from_numeric`` for mixed-numeric layers).
        """
        kernel_shape = l_desc.kernel_shape
        strides = l_desc.strides
        input_shape = l_desc.input_shape
        pad_h_begin, pad_h_end, pad_w_begin, pad_w_end = calculate_padding(
            l_desc.padding,
            kernel_shape[1],
            kernel_shape[2],
            strides[1],
            strides[2],
            input_shape[1],
            input_shape[2],
        )

        fill_value = self._get_const("padding_const_value", np.array(0.0, dtype=np.float32))
        paddings = [[0, 0], [pad_h_begin, pad_h_end], [pad_w_begin, pad_w_end], [0, 0]]
        padded = tf.pad(tensor=inp, paddings=paddings, mode="CONSTANT", constant_values=fill_value)

        pooled = tf_callback(padded, kernel_shape, strides, "VALID", name="op")
        if l_desc in self.mixed_numeric_layers:
            pooled = self._from_numeric(pooled)

        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, [pooled])
        return pooled

    def _build_resize(self, l_desc, inp, inter_layer_precision_mode=None):
        """
        Build the resize op (nearest neighbor or bilinear) described by ``l_desc``.

        A rank-2 input is first reshaped to ``[-1, 1, 1, features]``; in that case
        ``l_desc.input_shape`` is overwritten with the same shape.

        Args:
            l_desc: Layer description supplying ``h_ratios``, ``w_ratios``, ``f_ratios``, ``input_shape``,
                the resize method and the pixel-mode flags.
            inp (tf.Tensor): Input tensor of rank 2 or 4.
            inter_layer_precision_mode: Forwarded to ``_build_split_resize`` for bilinear resize on the
                numeric emulation targets. Defaults to None.

        Returns:
            tuple: ``(op, k)``. ``op`` is the resized tensor; ``k`` is whatever ``_build_split_resize``
            returns as its second value for numeric bilinear resize, and None in every other case.

        Raises:
            BackendEmulatorException: If the output height/width is not (nearly) an integer, or if the
                resize method is neither nearest neighbor nor bilinear.

        """
        if len(inp.shape) == 2:
            input_features = int(inp.shape[1])
            inp = tf.reshape(inp, [-1, 1, 1, input_features])
            l_desc.input_shape = [-1, 1, 1, input_features]

        new_height = np.prod(l_desc.h_ratios, initial=l_desc.input_shape[1])
        new_width = np.prod(l_desc.w_ratios, initial=l_desc.input_shape[2])
        new_features = int(l_desc.f_ratios[0])

        # For some ratios we will lose precision by multiplying and new_height/new_width will have a slight
        # difference from the integer value
        resize_ratios_thresh = 0.005
        rounded_height = int(np.round(new_height))
        rounded_width = int(np.round(new_width))
        if (
            np.abs(new_height - rounded_height) > resize_ratios_thresh
            or np.abs(new_width - rounded_width) > resize_ratios_thresh
        ):
            raise BackendEmulatorException(f"Resize requires integer dimensions. Received [{new_height}, {new_width}]")

        k = None
        output_size = (rounded_height, rounded_width)
        align_corners = l_desc.resize_bilinear_pixels_mode.value == ResizeBilinearPixelsMode.align_corners.value
        if l_desc._method == ResizeMethod.nearest_neighbor:
            half_pixels = l_desc.is_nearest_half_pixels
            op, _ = self._build_split_resize(
                inp,
                l_desc.h_ratios,
                l_desc.w_ratios,
                l_desc._method,
                half_pixels=half_pixels,
                align_corners=align_corners,
            )
            if new_features > 1:
                # Replicate every feature new_features times along the channel axis.
                op = tf.repeat(op, repeats=new_features, axis=3)
                # this reshape is to fix the 'None' in the last dim
                op = tf.reshape(op, [-1, op.shape[1], op.shape[2], inp.shape[3] * new_features])

        elif l_desc._method == ResizeMethod.bilinear:
            half_pixels = l_desc.is_bilinear_half_pixels
            if self.target in [
                EmulationInferenceTargets.SDK_NUMERIC,
                EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
                EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,
            ]:
                # Numeric targets go through the chained quantized bilinear implementation.
                is_16_bit_precision = l_desc.precision_config.precision_mode in [
                    PrecisionMode.a16_w16,
                    PrecisionMode.a16_w16_a16,
                    PrecisionMode.a16_w16_a8,
                ]
                op, k = self._build_split_resize(
                    inp,
                    l_desc.h_ratios,
                    l_desc.w_ratios,
                    l_desc._method,
                    align_corners,
                    half_pixels,
                    inter_layer_precision_mode,
                    is_16_bit_precision,
                    self._is_int32_numeric(l_desc, is_16_bit_precision),
                )
            else:
                # Other targets resize straight to the final size with TensorFlow's bilinear op.
                op = tf.compat.v1.image.resize_bilinear(
                    inp,
                    output_size,
                    align_corners=align_corners,
                    half_pixel_centers=half_pixels,
                )
        else:
            raise BackendEmulatorException(f"Unsupported resize method {l_desc._method} for node {l_desc.name}")

        if l_desc in self.mixed_numeric_layers:
            op = self._from_numeric(op)

        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, [op])
        return op, k

    def _build_nms(self, l_desc, inp, qp_in):
        """
        Build the non-maximum-suppression (NMS) op for ``l_desc``.

        For the SDK_NUMERIC and SDK_DEBUG_PRECISE_NUMERIC targets the input is re-ordered and flattened
        into rows of [y_min, x_min, y_max, x_max, score] and handed to ``self.hsim.hnms``. For every other
        target, ``tf.image.non_max_suppression`` is applied per image and per slice of axis 1.

        Args:
            l_desc: Layer description supplying ``scores_threshold``, ``iou_threshold``, ``max_output_size``,
                ``input_division_factor``, ``output_shape`` and ``name``.
            inp (tf.Tensor): Input tensor, indexed below as a rank-4 tensor.
                NOTE(review): the inline comments say it comes from the proposal generator - confirm.
            qp_in: Indexable; ``qp_in[0]`` is added and ``qp_in[1]`` is the divisor when quantizing the
                scores threshold. Presumably (zero point, scale) - TODO confirm. Only read on the
                numeric targets.

        Returns:
            tf.Tensor: The NMS output (passed through ``_from_numeric`` for mixed-numeric layers).

        """

        def _single_image_nms_fn(args):
            # args is the (boxes, scores, original rows) triple of a single image, as sliced by tf.map_fn.
            per_image_boxes = args[0]
            per_image_scores = args[1]
            per_image_orig_input = args[2]
            # Keep only the entries whose score is above the layer's threshold.
            th_ind = tf.squeeze(tf.compat.v1.where(per_image_scores > l_desc.scores_threshold), axis=1)
            per_image_boxes_th = tf.gather(per_image_boxes, th_ind, axis=0)
            per_image_scores_th = tf.gather(per_image_scores, th_ind, axis=0)
            per_image_orig_input_th = tf.gather(per_image_orig_input, th_ind, axis=0)
            op_ind = tf.image.non_max_suppression(
                per_image_boxes_th,
                per_image_scores_th,
                l_desc.max_output_size,
                l_desc.iou_threshold,
            )
            op_ind_shape = tf.shape(input=op_ind)
            op_one_im = tf.gather(per_image_orig_input_th, op_ind, axis=0)
            # Zero-pad along axis 0 so that every image yields exactly max_output_size rows.
            return tf.pad(
                tensor=op_one_im,
                paddings=([0, l_desc.max_output_size - op_ind_shape[0]], [0, 0], [0, 0]),
                mode="CONSTANT",
                constant_values=0,
            )

        def _single_image_mat_mul_fn(args):
            per_image_inp = args[0]
            # Constant 8x4 matrix: column j holds 1 in row 2j and 256 in row 2j+1, so the matmul below
            # merges each pair of adjacent cells into cell[2j] + 256 * cell[2j + 1].
            two_bytes_mat = np.kron(np.eye(8), [[1], [256]])[0:8, 0:4]
            two_bytes_mat = tf.constant(two_bytes_mat, dtype=tf.float32)
            two_bytes_inp = tf.matmul(tf.squeeze(per_image_inp), two_bytes_mat)
            return tf.expand_dims(two_bytes_inp, axis=0)

        # Only consumed by the non-numeric branch below (to decide on the int32 cast).
        is_16bit_precision_mode = self._is_16bit_input(l_desc)
        factor = l_desc.input_division_factor
        if self.target in [EmulationInferenceTargets.SDK_NUMERIC, EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC]:
            # Quantized threshold: rint(scores_threshold / qp_in[1] + qp_in[0]).
            scores_threshold_quantized = tf.math.rint(tf.add(tf.divide(l_desc.scores_threshold, qp_in[1]), qp_in[0]))
            # input from proposal generator
            if inp.dtype is not tf.float32:
                inp = tf.cast(inp, tf.float32)
            # Start from index 0 of axis 1, then append (on axis 3) the matching index of each further
            # division: i * (number_of_classes // factor).
            cinp = inp[:, 0:1, :, :]
            number_of_classes = int(inp.shape[1])
            for i in range(1, factor):
                ind = i * int(number_of_classes // factor)
                cinp = tf.concat([cinp, inp[:, ind : ind + 1, :, :]], axis=3)
            # stack all boxes from all classes (since hsim gets [-1, #total proposals, 5, 1])

            for class_ind in range(1, int(number_of_classes / factor)):
                for i in range(factor):
                    ind = class_ind + i * int(number_of_classes // factor)
                    cinp = tf.concat([cinp, inp[:, ind : ind + 1, :, :]], axis=3)
            # Swap axes 2 and 3.
            inp = tf.transpose(a=cinp, perm=[0, 1, 3, 2])
            # 8 cells into 4 cells (16 bytes concatenation)
            # NOTE(review): each merged cell is low + 256 * high, i.e. presumably 16 *bits* - confirm.
            if inp.shape[3] == 8:
                inp = tf.map_fn(
                    _single_image_mat_mul_fn,
                    (inp),
                    dtype=(tf.float32),
                    parallel_iterations=32,
                    back_prop=False,
                )

            # divide each 5X4 bbox table into 4 bbox rows
            # Rows 0..4 of every group of 5 along axis 2 are y_min, x_min, y_max, x_max and score.
            inp_shapes = inp.shape
            y_min = tf.reshape(inp[:, :, 0::5, :], [tf.shape(input=inp)[0], inp_shapes[2] // 5 * inp_shapes[3], 1, 1])
            x_min = tf.reshape(inp[:, :, 1::5, :], [tf.shape(input=inp)[0], inp_shapes[2] // 5 * inp_shapes[3], 1, 1])
            y_max = tf.reshape(inp[:, :, 2::5, :], [tf.shape(input=inp)[0], inp_shapes[2] // 5 * inp_shapes[3], 1, 1])
            x_max = tf.reshape(inp[:, :, 3::5, :], [tf.shape(input=inp)[0], inp_shapes[2] // 5 * inp_shapes[3], 1, 1])
            scores = tf.reshape(inp[:, :, 4::5, :], [tf.shape(input=inp)[0], inp_shapes[2] // 5 * inp_shapes[3], 1, 1])
            boxes = tf.concat([y_min, x_min, y_max, x_max], axis=2)
            if self.target != EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC:
                # Keep only the 12 least significant bits of every box coordinate.
                boxes = tf.cast(boxes, tf.int32)
                boxes = tf.bitwise.bitwise_and(boxes, 0xFFF)
                boxes = tf.cast(boxes, tf.float32)
            cinp = tf.concat([boxes, scores], axis=2)

            op = self.hsim.hnms(
                cinp,
                scores_threshold_quantized,
                l_desc.iou_threshold,
                l_desc.max_output_size,
                number_of_classes,
                False,
                self.is_mercury_arch,
            )
            op = tf.reshape(
                op,
                [l_desc.output_shape[0], number_of_classes // factor, l_desc.max_output_size * factor, 5],
            )
            op = tf.transpose(a=op, perm=[0, 1, 3, 2])
        else:
            # The first and last ops of this branch get fixed names (NMS_FIRST_OP / NMS_LAST_OP).
            inp = tf.identity(inp, name=NMS_FIRST_OP)
            inp = tf.transpose(a=inp, perm=[0, 1, 3, 2])
            classes = int(inp.shape[1])
            nms_out = []
            for i in range(factor):
                for class_ind in range(int(classes / factor)):
                    ind = class_ind + i * int(classes / factor)
                    inp_one_class = inp[:, ind : ind + 1, :, :]
                    # 8 cells into 4 cells (16 bytes concatenation)
                    jinp = inp_one_class
                    if inp_one_class.shape[3] == 8:
                        jinp = tf.map_fn(
                            _single_image_mat_mul_fn,
                            inp_one_class,
                            dtype=(tf.float32),
                            parallel_iterations=32,
                            back_prop=False,
                        )
                    # divide each 5X4 bbox table into 4 bbox rows
                    jinp_shapes = jinp.shape
                    y_min = tf.reshape(
                        jinp[:, :, 0::5, :],
                        [tf.shape(input=jinp)[0], jinp_shapes[2] // 5 * jinp_shapes[3], 1],
                    )
                    x_min = tf.reshape(
                        jinp[:, :, 1::5, :],
                        [tf.shape(input=jinp)[0], jinp_shapes[2] // 5 * jinp_shapes[3], 1],
                    )
                    y_max = tf.reshape(
                        jinp[:, :, 2::5, :],
                        [tf.shape(input=jinp)[0], jinp_shapes[2] // 5 * jinp_shapes[3], 1],
                    )
                    x_max = tf.reshape(
                        jinp[:, :, 3::5, :],
                        [tf.shape(input=jinp)[0], jinp_shapes[2] // 5 * jinp_shapes[3], 1],
                    )
                    scores = tf.reshape(
                        jinp[:, :, 4::5, :],
                        [tf.shape(input=jinp)[0], jinp_shapes[2] // 5 * jinp_shapes[3], 1],
                    )
                    # cinp keeps the full [y_min, x_min, y_max, x_max, score] rows; these are the rows
                    # that the per-image NMS function gathers and returns.
                    cinp = tf.concat([y_min, x_min, y_max, x_max, scores], axis=2)
                    cinp = tf.expand_dims(cinp, -1)
                    boxes = tf.concat([y_min, x_min, y_max, x_max], 2)
                    scores = scores[:, :, 0]
                    nms_out_one_class = tf.map_fn(
                        _single_image_nms_fn,
                        (boxes, scores, cinp),
                        dtype=(tf.float32),
                        parallel_iterations=32,
                        back_prop=False,
                    )
                    # The first division creates the per-class entry; later divisions are appended to
                    # the same entry along axis 1.
                    if i == 0:
                        nms_out.append(nms_out_one_class)
                    else:
                        nms_out[class_ind] = tf.concat([nms_out[class_ind], nms_out_one_class], axis=1)
            nms_out = tf.transpose(a=tf.concat(nms_out, axis=3), perm=[0, 3, 2, 1])
            op = tf.identity(nms_out, name=NMS_LAST_OP)
            if self._is_int32_numeric(l_desc, is_16bit_precision_mode):
                op = tf.cast(op, tf.int32)

        if l_desc in self.mixed_numeric_layers:
            op = self._from_numeric(op)

        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, [op])
        return op

    def _build_depth_to_space(self, l_desc, inp):
        """
        Build a depth-to-space op, optionally followed by a strided crop of the height/width axes.

        Args:
            l_desc: Layer description supplying ``block_sizes``, ``depth_to_space_type``, ``height_slice``,
                ``width_slice`` and ``name``.
            inp (tf.Tensor): Rank-4 input tensor.

        Returns:
            tf.Tensor: The rearranged (and possibly sliced) tensor.

        """
        block_h = l_desc.block_sizes[0]
        block_w = l_desc.block_sizes[1]
        is_dcr = l_desc.depth_to_space_type == DepthToSpaceType.dcr

        if block_h == block_w and is_dcr:
            # Square DCR blocks: a block size of 1 is a no-op, anything else maps to TensorFlow's op.
            if block_h == 1:
                op = tf.identity(inp, "space_to_depth")
            else:
                op = tf.compat.v1.depth_to_space(inp, block_h)
        else:
            _, in_h, in_w, in_c = inp.shape.as_list()
            out_h = int(in_h * block_h)
            out_w = int(in_w * block_w)
            out_c = int(in_c / (block_h * block_w))
            if is_dcr:
                op = tf.reshape(inp, (-1, in_h, in_w, block_h, block_w * out_c))
            else:
                # Non-DCR ordering: first move the block index in front of the output channels.
                op = tf.reshape(inp, (-1, in_h, in_w, out_c, block_h * block_w))
                op = tf.transpose(a=op, perm=(0, 1, 2, 4, 3))
                op = tf.reshape(op, (-1, in_h, in_w, block_h, out_c * block_w))
            # Both orderings end with the same transpose + reshape into the output layout.
            op = tf.transpose(a=op, perm=(0, 1, 3, 2, 4))
            op = tf.reshape(op, (-1, out_h, out_w, out_c))

        if l_desc in self.mixed_numeric_layers:
            op = self._from_numeric(op)

        rows = l_desc.height_slice
        if rows:
            op = op[:, rows[0] : rows[1] : rows[2], :, :]
        cols = l_desc.width_slice
        if cols:
            op = op[:, :, cols[0] : cols[1] : cols[2], :]

        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, [op])
        return op

    def _build_space_to_depth(self, l_desc, inp):
        """
        Build a space-to-depth op for the layout selected by ``l_desc.space_to_depth_type``.

        Args:
            l_desc: Layer description supplying ``block_sizes``, ``space_to_depth_type``,
                ``spatial_flatten_output``, ``output_shape`` and ``name``.
            inp (tf.Tensor): Rank-4 input tensor.

        Returns:
            tf.Tensor: The rearranged tensor, reshaped to ``l_desc.output_shape`` when
            ``l_desc.spatial_flatten_output`` is set.

        """
        block_h, block_w = l_desc.block_sizes[0], l_desc.block_sizes[1]
        _, in_h, in_w, in_c = inp.shape.as_list()
        out_h = int(in_h / block_h)
        out_w = int(in_w / block_w)
        out_c = int(in_c * block_h * block_w)
        final_shape = (-1, out_h, out_w, out_c)

        if l_desc.space_to_depth_type == SpaceToDepthType.serial:
            column_shape = (-1, out_h, 1, block_h * block_w)
            reshaped = tf.reshape(inp, (-1, out_h, block_w * out_w, block_h, in_c), name="reshape_input")
            per_channel = tf.split(reshaped, in_c, axis=4, name="channels_splits")
            columns_per_channel = []
            for channel in per_channel:
                # Build the output columns one at a time: column i collects every out_w-th entry of
                # axis 2, starting at offset i.
                columns = tf.reshape(channel[:, :, ::out_w, :, :], column_shape, name="reshape_slice")
                for col in range(1, out_w):
                    next_column = tf.reshape(channel[:, :, col::out_w, :, :], column_shape)
                    columns = tf.concat((columns, next_column), axis=2)
                columns_per_channel.append(columns)
            stacked = tf.concat(columns_per_channel, axis=3, name="channels_stack")
            op = tf.reshape(stacked, final_shape)
        else:
            op = tf.reshape(inp, (-1, out_h, block_h, out_w, block_w, in_c))
            # Each supported layout differs only in the axis permutation; any other type is reshaped
            # without a transpose.
            layout_perms = (
                (SpaceToDepthType.classic_dcr, (0, 1, 3, 2, 4, 5)),
                (SpaceToDepthType.classic_crd, (0, 1, 3, 5, 2, 4)),
                (SpaceToDepthType.focus, (0, 1, 3, 4, 2, 5)),
            )
            for layout, perm in layout_perms:
                if l_desc.space_to_depth_type == layout:
                    op = tf.transpose(a=op, perm=perm)
                    break
            op = tf.reshape(op, final_shape)

        if l_desc in self.mixed_numeric_layers:
            op = self._from_numeric(op)

        if l_desc.spatial_flatten_output:
            op = tf.reshape(tensor=op, shape=l_desc.output_shape)

        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, [op])
        return op

    def _build_slice(self, l_desc, inp):
        """
        Build a slice op that crops height, width and (per group) features.

        A rank-2 input is treated as ``[-1, 1, 1, features]``. The feature axis is split into
        ``l_desc.groups`` groups, the same feature range is taken from every group, and the groups are
        merged back into a single feature axis.

        Args:
            l_desc: Layer description supplying ``groups``, ``height_slice``, ``width_slice``,
                ``features_slice`` and ``name``.
            inp (tf.Tensor): Input tensor of rank 2 or 4.

        Returns:
            tf.Tensor: The sliced rank-4 tensor.

        """
        if len(inp.get_shape()) == 2:
            op = tf.reshape(inp, [-1, 1, 1, inp.shape[1]])
        else:
            op = inp

        groups = l_desc.groups
        grouped_shape = [-1, op.shape[1], op.shape[2], groups, int(op.shape[3] / groups)]
        op = tf.reshape(op, grouped_shape)

        h_begin, h_end = l_desc.height_slice[0], l_desc.height_slice[1]
        w_begin, w_end = l_desc.width_slice[0], l_desc.width_slice[1]
        f_begin, f_end = l_desc.features_slice[0], l_desc.features_slice[1]
        op = op[:, h_begin:h_end, w_begin:w_end, :, f_begin:f_end]

        # Fold the group axis back into the feature axis.
        merged_shape = [-1, op.shape[1], op.shape[2], op.shape[3] * op.shape[4]]
        op = tf.reshape(op, merged_shape)

        if l_desc in self.mixed_numeric_layers:
            op = self._from_numeric(op)

        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, [op])
        return op

    def _build_split_resize(
        self,
        inp,
        h_ratios,
        w_ratios,
        method,
        align_corners=True,
        half_pixels=False,
        inter_layer_precision_mode=None,
        is_16_bit_precision=False,
        is_int32_numeric=False,
    ):
        """
        Split a resize into a "chain" of resize ops, one per (h_ratio, w_ratio) pair.
        Each "sub node" resizes by the ratios at its index; its input is the previous node in the chain.

        Example:
            input:
                * h_ratios = w_ratios = [16, 2]
                * inp.shape == [1, 4, 2, 3]
            output:
                * numeric_bilinear1(input=inp, input_shape=[1, 4, 2, 3], output_shape=[1, 64, 32, 3])
                * numeric_bilinear2(input=numeric_bilinear1, input_shape=[1, 64, 32, 3], output_shape=[1, 128, 64, 3])
                  numeric_bilinear2 will be returned.

        Args:
            inp (tf.Tensor): A batch of one or more images to be resized.
                A 4D tensor of shape [batch_size, image_height, image_width, features]
            h_ratios (list): List of the resize ratios for the height dimension
            w_ratios (list): List of the resize ratios for the width dimension
            method (ResizeMethod): ``ResizeMethod.nearest_neighbor`` uses TensorFlow's nearest-neighbor
                resize; any other value goes through ``_build_quantized_resize_bilinear``.
            align_corners (bool, optional): Defaults to True. Forwarded to the underlying resize op.
            half_pixels (bool, optional): Defaults to False. Forwarded to the underlying resize op.
            inter_layer_precision_mode (optional): Bilinear only; forwarded as is.
            is_16_bit_precision (bool, optional): Bilinear only; forwarded as is.
            is_int32_numeric (bool, optional): Bilinear only; forwarded as is.

        Returns:
            tuple: ``(op, k)`` - the last resize op of the chain, and the ``k`` value returned by the last
            bilinear sub-resize (None for nearest neighbor, or when the ratio lists are empty).

        Raises:
            BackendEmulatorException: If an intermediate bilinear output dimension is not (nearly) an integer.

        """
        # Tolerance for floating point error when multiplying a dimension by a ratio.
        resize_ratios_thresh = 0.005
        cur_height = inp.shape[1]
        cur_width = inp.shape[2]
        cur_op = inp
        k = None
        for h, w in zip(h_ratios, w_ratios):
            new_height = cur_height * h
            new_width = cur_width * w
            if method == ResizeMethod.nearest_neighbor:
                # NOTE(review): the size is truncated here, while the bilinear path rounds - confirm
                # this is intended for ratios that are not exactly representable.
                cur_op = tf.compat.v1.image.resize_nearest_neighbor(
                    cur_op,
                    [int(new_height), int(new_width)],
                    half_pixel_centers=half_pixels,
                    align_corners=align_corners,
                )
            else:
                rounded_height = int(np.round(new_height))
                rounded_width = int(np.round(new_width))
                # Compare the absolute deviation, so that a dimension falling short of the nearest
                # integer (e.g. 63.6) is rejected just like one exceeding it (e.g. 64.4).
                if (
                    np.abs(new_height - rounded_height) > resize_ratios_thresh
                    or np.abs(new_width - rounded_width) > resize_ratios_thresh
                ):
                    raise BackendEmulatorException(
                        f"Resize requires integer dimensions. Received [{new_height}, {new_width}]",
                    )

                cur_op, k = self._build_quantized_resize_bilinear(
                    cur_op,
                    (rounded_height, rounded_width),
                    align_corners=align_corners,
                    half_pixels=half_pixels,
                    inter_layer_precision_mode=inter_layer_precision_mode,
                    is_16_bit_precision=is_16_bit_precision,
                    is_int32_numeric=is_int32_numeric,
                )

            # The next link of the chain starts from the actual shape of the op just built.
            cur_height = cur_op.shape[1]
            cur_width = cur_op.shape[2]

        return cur_op, k

    def _build_quantized_resize_bilinear(
        self,
        inp,
        size,
        shift=0,
        residual_width=4,
        align_corners=True,
        half_pixels=False,
        inter_layer_precision_mode=None,
        is_16_bit_precision=False,
        is_int32_numeric=False,
    ):
        """
        Quantized version of resize with bilinear interpolation, emulating the PPU's implementation.
        The function can enlarge or reduce the size of the image.
        Based on crop_and_resize_q in https://bitbucket.org/hailotech/ppu-numeric/src/master/bilinear.py (branch: master, hash: 060e0cf)

        Two code paths exist. When ``align_corners`` is True and ``inter_layer_precision_mode`` is falsy, the
        sampling grid is built with TensorFlow ops. In every other case (``is_resize_lcu`` below), the input is
        padded symmetrically by one pixel per spatial side, the grid is built with numpy, and a kernel variable
        ``k`` is created.

        Args:
            inp (tf.Tensor): A batch of one or more images to be resized.
                A 4D tensor of shape [batch_size, image_height, image_width, features]
            size: A pair (new_height, new_width), the new size for the images. It is unpacked directly;
                ``_build_split_resize`` passes a tuple of two Python ints.
            shift (int, optional): Defaults to 0. Right shift to the image coordinates, as is done in the PPU.
                The real coordinates are (x,y)*2**(-shift)
            residual_width (int, optional): Defaults to 4. The precision of the residual.
            align_corners (bool, optional): Defaults to True. Selects the corner-aligned sampling grid.
            half_pixels (bool, optional): Defaults to False. Selects the half-pixel-centers sampling grid
                (only read on the ``is_resize_lcu`` path).
            inter_layer_precision_mode (optional): Defaults to None. Takes part in selecting the code path
                and is forwarded to ``self._get_accumulator_size``.
            is_16_bit_precision (bool, optional): Defaults to False. Not read inside this function.
            is_int32_numeric (bool, optional): Defaults to False. When set, the kernel ``k`` is cast to int32.

        Returns:
            tuple: ``(op, k)`` - the resized tensor, and the kernel (None on the non-``is_resize_lcu`` path).

        Raises:
            BackendEmulatorException: If ``inp`` is not 4-D, if ``shift > residual_width``, or if the input
                dims * 2**shift exceed the PPU box limit.

        """
        if inp.shape.ndims != 4:
            raise BackendEmulatorException(
                f"Requires a 4-D tensor. The input tensor's shape is {inp.shape} ({inp.shape.ndims}-D).",
            )
        if shift > residual_width:
            raise BackendEmulatorException(
                f"'shift' must be less than or equal to 'residual_width'. Received shift={shift} residual_width={residual_width}",
            )
        # Dimensions of the input *before* the optional padding below; they drive the scales and the size check.
        im_in_h, im_in_w, depth = (int(dim) for dim in inp.shape[1:])
        h_out, w_out = size
        h_scale, w_scale = h_out / im_in_h, w_out / im_in_w
        # False only when align_corners is set and no inter_layer_precision_mode was given.
        is_resize_lcu = not (align_corners and not inter_layer_precision_mode)
        # Only consumed by self.hsim.h_bilinear (the lcu_quantized case below).
        bias = tf.Variable(tf.random.normal([inp.shape[-1]], stddev=0.35), name="bias", shape=tf.TensorShape(None))
        if self.target == EmulationInferenceTargets.SDK_PARTIAL_NUMERIC:
            bias_factor = tf.Variable(initial_value=0, dtype=tf.float32, trainable=False, name="bias_factor")
            bias_feed_repeat = tf.Variable(initial_value=0, dtype=tf.float32, trainable=False, name="bias_feed_repeat")
            bias = bias * bias_factor * bias_feed_repeat

        if is_resize_lcu:
            # Replicate the border: one extra row/column on each spatial side.
            paddings = tf.constant([[0, 0], [1, 1], [1, 1], [0, 0]])
            inp = tf.pad(inp, paddings, "SYMMETRIC")

        # TODO: See SDK-7906
        # Our resize_bilinear uses crop_and_resize on the ppu, where the boxes are passed as the dimension of the
        # input image, one for each batch. The boxes on the ppu are represented with 12 bits, with a right shift.
        # In other words, a given coordinate x, will be interpreted as x * 2**-shift on the ppu.
        # The shift param is irrelevant for resize, only for crop and resize, so we set it to zero and can thus resize
        # images from up to 4096.
        PPU_BOX_MAX_SIZE = 2**12
        if max(im_in_h, im_in_w) * 2**shift > PPU_BOX_MAX_SIZE:
            raise BackendEmulatorException(
                f"The input image's dims * 2**shift are limted to {PPU_BOX_MAX_SIZE}. Received height={im_in_h}, width={im_in_w}, shift={shift}",
            )

        # Quantize the image's corners
        # Note: We *must* run float64 ops to match the ppu-numeric implementation. Not specifying the dtype will
        # result in the ops being run in float32. This will cause diffs between the two implementations.
        # Creating y1 and x1 as float64 will make all of the ops float64.
        # The layer name is the name-scope prefix of the current `inp` (the padded tensor on the lcu path).
        layer_name = "/".join(inp.name.split("/")[:-1])
        lcu_quantized = is_resize_lcu and layer_name in self.activation_points
        w_factor = 2 * (w_out - 1) if align_corners and w_out > 1 else 2 * w_out
        h_factor = 2 * (h_out - 1) if align_corners and h_out > 1 else 2 * h_out
        if is_resize_lcu:
            # On this path the grid coordinates are kept scaled by h_factor / w_factor until they are
            # divided back further below.
            kernel_shape = (h_out, w_out, 4, 1)
            if half_pixels:
                y1, x1 = (0.5 + 0.5 / h_scale) * h_factor * 2**shift, (0.5 + 0.5 / w_scale) * w_factor * 2**shift
                y2, x2 = (
                    (im_in_h + 0.5 - 0.5 / h_scale) * h_factor * 2**shift,
                    (im_in_w + 0.5 - 0.5 / w_scale) * w_factor * 2**shift,
                )
            elif align_corners:
                y1, x1 = h_factor * 2**shift, w_factor * 2**shift
                y2, x2 = im_in_h * h_factor * 2**shift, im_in_w * w_factor * 2**shift
            else:
                y1, x1 = h_factor * 2**shift, w_factor * 2**shift
                y2, x2 = (
                    (im_in_h + 1 - (1 / h_scale)) * h_factor * 2**shift,
                    (im_in_w + 1 - (1 / w_scale)) * w_factor * 2**shift,
                )

            # Kernel variable holding four weights per output pixel; it is randomly initialized here.
            # The slice names follow the four bilinear weights: (1-rx)(1-ry), rx(1-ry), ry(1-rx), rx*ry.
            k = tf.Variable(tf.random.normal(kernel_shape, stddev=0.35), name="kernel")
            if is_int32_numeric:
                k = tf.cast(k, tf.int32)
            one_m_rx_one_m_ry, rx_one_m_ry, ry_one_m_rx, rx_ry = (
                k[:, :, 0, :],
                k[:, :, 1, :],
                k[:, :, 2, :],
                k[:, :, 3, :],
            )
        else:
            y1, x1 = tf.constant(0, dtype=tf.float64), tf.constant(0, dtype=tf.float64)
            y2, x2 = (im_in_h - 1) * 2**shift, (im_in_w - 1) * 2**shift
            k = None
        # Generate input image grid
        if is_resize_lcu:
            # x_in_corner / y_in_corner are the start offsets of the half-pixel fix-up loops further below.
            if not half_pixels:
                x_in_corner, y_in_corner = 0, 0
            else:
                if int(w_scale) == w_scale:
                    x_in_corner = int(w_scale / 2)
                elif (x2 - x1) != 0:
                    x_in_corner = math.ceil((w_factor - x1) / (w_factor / w_scale))
                else:
                    x_in_corner = 0
                if int(h_scale) == h_scale:
                    y_in_corner = int(h_scale / 2)
                elif (y2 - y1) != 0:
                    y_in_corner = math.ceil((h_factor - y1) / (h_factor / h_scale))
                else:
                    y_in_corner = 0
            x_range = np.linspace(x1 / (2**shift), x2 / (2**shift), w_out)
            y_range = np.linspace(y1 / (2**shift), y2 / (2**shift), h_out)
        else:
            y_range = tf.linspace(y1 / (2**shift), y2 / (2**shift), h_out)
            x_range = tf.linspace(x1 / (2**shift), x2 / (2**shift), w_out)

        if not lcu_quantized:
            # Quantize the grid coordinates: round at 4 * residual_width fractional bits, then round again
            # down to residual_width fractional bits.
            if is_resize_lcu:
                y_range = old_div(
                    np.round(old_div(np.round(y_range * 2 ** (residual_width * 4)), 2 ** (3 * residual_width))),
                    2**residual_width,
                )
                x_range = old_div(
                    np.round(old_div(np.round(x_range * 2 ** (residual_width * 4)), 2 ** (3 * residual_width))),
                    2**residual_width,
                )
            else:
                y_range = old_div(
                    tf.round(old_div(tf.round(y_range * 2 ** (residual_width * 4)), 2 ** (3 * residual_width))),
                    2**residual_width,
                )
                x_range = old_div(
                    tf.round(old_div(tf.round(x_range * 2 ** (residual_width * 4)), 2 ** (3 * residual_width))),
                    2**residual_width,
                )
        if is_resize_lcu:
            xx_in, yy_in = np.meshgrid(x_range, y_range)
        else:
            xx_in, yy_in = tf.meshgrid(x_range, y_range)
        # Calculate coordinates and residuals for bilinear
        if is_resize_lcu:
            # Integer part of the coordinate, with the factor scaling divided out.
            xx_in_floor = xx_in // w_factor
            if w_scale % 2 != 0 and w_scale == int(w_scale) and half_pixels:
                # Odd integer scale with half pixels: every w_scale-th column (from x_in_corner) whose
                # floor equals that of the previous column is overwritten - with the next column's floor,
                # or with 0 for the last column.
                # NOTE(review): presumably handles samples landing exactly on a pixel center - confirm.
                for i in range(x_in_corner, xx_in_floor[0].size, int(w_scale)):
                    if xx_in_floor[0, i] == xx_in_floor[0, i - 1]:
                        if i == xx_in_floor[0].size - 1:
                            xx_in_floor[:, i] = 0
                        else:
                            xx_in_floor[:, i] = xx_in_floor[:, i + 1]
            yy_in_floor = yy_in // h_factor
            if h_scale % 2 != 0 and h_scale == int(h_scale) and half_pixels:
                # Same fix-up as above, applied to the rows.
                for i in range(y_in_corner, yy_in_floor[:, 0].size, int(h_scale)):
                    if yy_in_floor[i, 0] == yy_in_floor[i - 1, 0]:
                        if i == yy_in_floor[:, 0].size - 1:
                            yy_in_floor[i, :] = 0
                        else:
                            yy_in_floor[i, :] = yy_in_floor[i + 1, :]
            # Divide the factor scaling out of the coordinates (in place on the numpy grids).
            xx_in /= w_factor
            yy_in /= h_factor
        else:
            xx_in_floor = tf.floor(xx_in)
            yy_in_floor = tf.floor(yy_in)
        xx_in_ceil = tf.math.ceil(xx_in)
        yy_in_ceil = tf.math.ceil(yy_in)
        if not lcu_quantized:
            # Resize is calculated in ppu
            r_x = tf.cast(
                tf.reshape((xx_in - xx_in_floor), (h_out, w_out, 1)),
                tf.float32,
            )  # The reshape is done to match the shape of xx_in_floor
            r_y = tf.cast(
                tf.reshape((yy_in - yy_in_floor), (h_out, w_out, 1)),
                tf.float32,
            )  # The reshape is done to match the shape of yy_in_floor
        # Calculate values in the horizontal axis (original note: "in the HW this calculation is losses" -
        # presumably meaning lossless, TODO confirm)
        # * Create the tensor of the upper left, upper right, bottom left and bottom right indices in the
        #   input image, that match each pixel in the output image
        # * Create a tensor of the upper left etc. pixels in the shape of the output image
        # Note that we flatten the index and image tensor to use the tf.gather op. TensorFlow doesn't support
        # the smart indexing that is used in the ppu-numeric implementation.
        # Re-read the dimensions: on the lcu path `inp` is now the padded tensor.
        im_in_h, im_in_w, depth = (int(dim) for dim in inp.shape[1:])
        inp_flat = tf.reshape(inp, [-1, im_in_h * im_in_w, depth])
        upper_left_indices_flat = tf.reshape(tf.cast((yy_in_floor * im_in_w) + xx_in_floor, tf.int32), [-1])
        upper_right_indices_flat = tf.reshape(tf.cast((yy_in_floor * im_in_w) + xx_in_ceil, tf.int32), [-1])
        bottom_left_indices_flat = tf.reshape(tf.cast((yy_in_ceil * im_in_w) + xx_in_floor, tf.int32), [-1])
        bottom_right_indices_flat = tf.reshape(tf.cast((yy_in_ceil * im_in_w) + xx_in_ceil, tf.int32), [-1])
        im_out_upper_left = tf.reshape(tf.gather(inp_flat, upper_left_indices_flat, axis=1), [-1, h_out, w_out, depth])
        im_out_upper_right = tf.reshape(
            tf.gather(inp_flat, upper_right_indices_flat, axis=1),
            [-1, h_out, w_out, depth],
        )
        im_out_bottom_left = tf.reshape(
            tf.gather(inp_flat, bottom_left_indices_flat, axis=1),
            [-1, h_out, w_out, depth],
        )
        im_out_bottom_right = tf.reshape(
            tf.gather(inp_flat, bottom_right_indices_flat, axis=1),
            [-1, h_out, w_out, depth],
        )
        if lcu_quantized:
            # Weighted sum of the four neighbours through the simulator op, using the kernel slices
            # (repeated `depth` times along the last axis) as the weights.
            accumulator_size = self._get_accumulator_size(inter_layer_precision_mode, False)
            one_m_rx_one_m_ry = tf.repeat(one_m_rx_one_m_ry, depth, axis=2)
            rx_one_m_ry = tf.repeat(rx_one_m_ry, depth, axis=2)
            ry_one_m_rx = tf.repeat(ry_one_m_rx, depth, axis=2)
            rx_ry = tf.repeat(rx_ry, depth, axis=2)
            float_accumulator = self.target == EmulationInferenceTargets.SDK_PARTIAL_NUMERIC
            im_out_full = self.hsim.h_bilinear(
                im_out_upper_left,
                im_out_upper_right,
                im_out_bottom_left,
                im_out_bottom_right,
                one_m_rx_one_m_ry,
                rx_one_m_ry,
                ry_one_m_rx,
                rx_ry,
                self.emulation_mult_shift,
                bias,
                accumulator_size=accumulator_size,
                float_accumulator=float_accumulator,
            )
        else:
            # Plain bilinear interpolation with the residuals: first along x, then along y.
            im_out_floor = (1 - r_x) * tf.cast(im_out_upper_left, tf.float32) + r_x * tf.cast(
                im_out_upper_right,
                tf.float32,
            )
            im_out_ceil = (1 - r_x) * tf.cast(im_out_bottom_left, tf.float32) + r_x * tf.cast(
                im_out_bottom_right,
                tf.float32,
            )
            # Calculate final value using the vertical residual
            im_out_full = (1 - r_y) * im_out_floor + r_y * im_out_ceil

        # Quantization
        if not is_resize_lcu:
            im_out_round = tf.round(im_out_full)
        elif lcu_quantized:
            # The simulator op's output is used as is.
            im_out_round = im_out_full
        else:
            # Round to 1/8 steps first, then to an integer.
            im_out_round = tf.round(tf.round(im_out_full * 8) / 8)

        # Assign to output
        op = im_out_round
        return op, k

    def _build_max_pool(self, l_desc, inp):
        """Build a max-pooling op by delegating to ``_build_pool`` with ``tf.nn.max_pool2d``."""
        max_pool_fn = tf.nn.max_pool2d
        return self._build_pool(l_desc, inp, tf_callback=max_pool_fn)

    def _build_global_avg_pool(self, l_desc, inp):
        """
        Build a global average pooling op: the mean over axes 1 and 2, keeping those dims.

        The mean is rounded on the SDK_NUMERIC and SDK_DEBUG_PRECISE_NUMERIC targets.

        Args:
            l_desc: Layer description (used as the node key and for its ``name``).
            inp (tf.Tensor): Input tensor.

        Returns:
            tf.Tensor: The pooled tensor.

        """
        rounding_targets = (
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
        )
        averaged = tf.reduce_mean(input_tensor=inp, axis=[1, 2], name="op", keepdims=True)
        if self.target in rounding_targets:
            averaged = tf.round(averaged)
        if l_desc in self.mixed_numeric_layers:
            averaged = self._from_numeric(averaged)

        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, [averaged])
        return averaged

    def _build_shortcut(self, l_desc, inp):
        """
        Build a shortcut op: an identity of ``inp`` named "shortcut".

        Args:
            l_desc: Layer description (used as the node key and for its ``name``).
            inp (tf.Tensor): Input tensor.

        Returns:
            tf.Tensor: The identity op (passed through ``_from_numeric`` for mixed-numeric layers).

        """
        passthrough = tf.identity(inp, name="shortcut")
        is_mixed_numeric = l_desc in self.mixed_numeric_layers
        result = self._from_numeric(passthrough) if is_mixed_numeric else passthrough
        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, [result])
        return result

    def _build_pad(self, l_desc, inp, qp_in):
        """
        Pad the input with a constant along axes 1-3 using the layer's padding amounts.

        For the numeric, precise-numeric and partial-numeric targets the fill value is obtained via
        ``_get_const("padding_const_value", qp_in[0])`` (cast to int32 when ``_is_int32_numeric``
        says so); for any other target ``tf.pad`` is called without an explicit fill value.
        """
        paddings = [
            [0, 0],
            [l_desc._top, l_desc._bottom],
            [l_desc._left, l_desc._right],
            [l_desc._front, l_desc._back],
        ]
        pad_kwargs = {"tensor": inp, "paddings": paddings, "mode": "CONSTANT"}

        quantized_targets = (
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
            EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,
        )
        if self.target in quantized_targets:
            sixteen_bit_modes = (PrecisionMode.a16_w16, PrecisionMode.a16_w16_a16)
            is_16bit_precision_mode = l_desc.precision_config.precision_mode in sixteen_bit_modes
            fill_value = self._get_const("padding_const_value", qp_in[0])
            if self._is_int32_numeric(l_desc, is_16bit_precision_mode):
                fill_value = tf.cast(fill_value, tf.int32)
            pad_kwargs["constant_values"] = fill_value

        padded = tf.pad(**pad_kwargs)
        if l_desc in self.mixed_numeric_layers:
            padded = self._from_numeric(padded)

        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, [padded])
        return padded

    def _build_format_conversion(self, l_desc, inp):
        """
        Build the emulation op for a format-conversion layer and register its node.

        The branch taken is selected by ``l_desc.conversion_type``; conversion types that are not
        handled explicitly fall back to an identity op. The result is passed through
        ``_from_numeric`` when the layer is listed in ``mixed_numeric_layers``.

        Args:
            l_desc: layer descriptor; the attributes read depend on the conversion type
                (e.g. ``input_height``, ``input_width``, ``output_shape``, ``groups``).
            inp: input tensor (indexed as a 4-D tensor in most branches).

        Returns:
            The converted tensor.

        """
        def spatial_reshape(data, spatial_shape):
            # Flatten axes 1 and 2 into one axis, then pad (tf.pad defaults) or truncate that axis
            # to spatial_shape[0] * spatial_shape[1] elements before restoring a 4-D layout.
            output_size = spatial_shape[0] * spatial_shape[1]
            input_size = data.shape[1] * data.shape[2]
            features = data.shape[-1]

            data = tf.reshape(data, [-1, input_size, data.shape[3]])
            if output_size > input_size:
                data = tf.pad(data, [[0, 0], [0, output_size - input_size], [0, 0]])
            else:
                data = data[:, :output_size, :]

            output_shape = [-1, spatial_shape[0], spatial_shape[1], features]
            return tf.reshape(data, output_shape)

        if l_desc.conversion_type in [
            FormatConversionType.mipi_bayer_rggb_to_hailo_rgb,
            FormatConversionType.mipi_bayer_bggr_to_hailo_rgb,
            FormatConversionType.mipi_bayer_grbg_to_hailo_rgb,
            FormatConversionType.mipi_bayer_gbrg_to_hailo_rgb,
        ]:
            # Pick the four samples of every 2x2 cell; which position holds r / g0 / g1 / b
            # depends on the Bayer pattern named by the conversion type.
            if l_desc.conversion_type == FormatConversionType.mipi_bayer_rggb_to_hailo_rgb:
                r = inp[:, ::2, ::2, :]
                g0 = inp[:, ::2, 1::2, :]
                g1 = inp[:, 1::2, ::2, :]
                b = inp[:, 1::2, 1::2, :]
            elif l_desc.conversion_type == FormatConversionType.mipi_bayer_bggr_to_hailo_rgb:
                b = inp[:, ::2, ::2, :]
                g0 = inp[:, ::2, 1::2, :]
                g1 = inp[:, 1::2, ::2, :]
                r = inp[:, 1::2, 1::2, :]
            elif l_desc.conversion_type == FormatConversionType.mipi_bayer_grbg_to_hailo_rgb:
                g0 = inp[:, ::2, ::2, :]
                r = inp[:, ::2, 1::2, :]
                b = inp[:, 1::2, ::2, :]
                g1 = inp[:, 1::2, 1::2, :]
            elif l_desc.conversion_type == FormatConversionType.mipi_bayer_gbrg_to_hailo_rgb:
                g0 = inp[:, ::2, ::2, :]
                b = inp[:, ::2, 1::2, :]
                r = inp[:, 1::2, ::2, :]
                g1 = inp[:, 1::2, 1::2, :]

            height = l_desc.input_height
            width = l_desc.input_width
            # Red and blue are upsampled to the full frame; each green plane is upsampled to half
            # the height, and the two are concatenated along axis 2 and reshaped to a full frame.
            r = tf.compat.v1.image.resize_nearest_neighbor(r, [height, width])
            b = tf.compat.v1.image.resize_nearest_neighbor(b, [height, width])
            g0 = tf.compat.v1.image.resize_nearest_neighbor(g0, [int(height / 2), width])
            g1 = tf.compat.v1.image.resize_nearest_neighbor(g1, [int(height / 2), width])
            g = tf.reshape(tf.concat([g0, g1], 2), [-1, height, width, 1])
            conversion = tf.concat([r, g, b], 3, name="format_conversion")

        elif l_desc.conversion_type == FormatConversionType.twelve_to_eight_bit:
            # Combine each pair of adjacent columns: the even column divided by 16 (floored) plus
            # the low 4 bits of the odd column multiplied by 16.
            conversion = tf.floor(inp[:, :, ::2, :] / 16) + (inp[:, :, 1::2, :] % 16) * 16
        elif l_desc.conversion_type == FormatConversionType.twelve_to_sixteen_bit:
            inp_int = tf.cast(inp, tf.uint16)

            # Calculate the width in terms of 16-bit blocks as we are working with 12-bit values packed in 16 bits
            width = l_desc.input_width
            width_16bits = (width * 2) // 3
            features = l_desc.input_features
            height = l_desc.input_height

            # Group the width axis into triplets: every 3 input values carry two 12-bit numbers.
            inp_int = tf.reshape(inp_int, [-1, height, width // 3, 3, features])

            # Extract the three bytes used to store the two 12-bit numbers
            byte1 = inp_int[:, :, :, 0, :]
            byte2 = inp_int[:, :, :, 1, :]
            byte3 = inp_int[:, :, :, 2, :]

            # Reconstruct the first 12-bit number using TensorFlow bitwise operations
            first_12bit = tf.bitwise.bitwise_or(
                tf.bitwise.bitwise_and(byte1, 0xFF), tf.bitwise.left_shift(tf.bitwise.bitwise_and(byte2, 0x0F), 8)
            )  # shape: (batch, height, width // 3, features)

            # Reconstruct the second 12-bit number using TensorFlow bitwise operations
            second_12bit = tf.bitwise.bitwise_or(
                tf.bitwise.right_shift(tf.bitwise.bitwise_and(byte2, 0xF0), 4),
                tf.bitwise.left_shift(tf.bitwise.bitwise_and(byte3, 0xFF), 4),
            )

            # Concatenating on the last axis and reshaping interleaves the two numbers along the width.
            conversion = tf.concat([first_12bit, second_12bit], 3, name="op")
            conversion = tf.reshape(conversion, [-1, height, width_16bits, features])
        elif l_desc.conversion_type == FormatConversionType.sixteen_to_twelve_bit:
            width = l_desc.input_width
            width_8bits = (width * 3) // 2
            features = l_desc.input_features
            height = l_desc.input_height

            # Group the width axis into quadruplets: every 4 input values are packed into 6 bytes.
            inp_int = tf.cast(inp, tf.uint64)
            inp_int = tf.reshape(inp_int, [-1, height, width // 4, 4, features])

            # Keep only the low 12 bits of each value.
            pixel1 = tf.bitwise.bitwise_and(inp_int[:, :, :, 0, :], 0xFFF)
            pixel2 = tf.bitwise.bitwise_and(inp_int[:, :, :, 1, :], 0xFFF)
            pixel3 = tf.bitwise.bitwise_and(inp_int[:, :, :, 2, :], 0xFFF)
            pixel4 = tf.bitwise.bitwise_and(inp_int[:, :, :, 3, :], 0xFFF)

            # Each pair of 12-bit values yields 3 bytes: low 8 bits of the first, a byte made of
            # the first's high nibble and the second's low nibble, and the second's high 8 bits.
            byte1 = tf.bitwise.bitwise_and(pixel1, 0xFF)
            byte2 = tf.bitwise.bitwise_or(
                tf.bitwise.right_shift(tf.bitwise.bitwise_and(pixel1, 0xF00), 8),
                tf.bitwise.left_shift(tf.bitwise.bitwise_and(pixel2, 0xF), 4),
            )
            byte3 = tf.bitwise.right_shift(tf.bitwise.bitwise_and(pixel2, 0xFF0), 4)
            byte4 = tf.bitwise.bitwise_and(pixel3, 0xFF)
            byte5 = tf.bitwise.bitwise_or(
                tf.bitwise.right_shift(tf.bitwise.bitwise_and(pixel3, 0xF00), 8),
                tf.bitwise.left_shift(tf.bitwise.bitwise_and(pixel4, 0xF), 4),
            )
            byte6 = tf.bitwise.right_shift(tf.bitwise.bitwise_and(pixel4, 0xFF0), 4)

            conversion = tf.concat([byte1, byte2, byte3, byte4, byte5, byte6], 3, name="op")
            conversion = tf.reshape(conversion, [-1, height, width_8bits, features])

        elif l_desc.conversion_type in (
            FormatConversionType.features_to_width_features,
            FormatConversionType.flat_to_frames,
            FormatConversionType.frames_to_flat,
        ):
            # Pure reshapes to the layer's declared output shape.
            conversion = tf.reshape(inp, l_desc.output_shape)
        elif l_desc.conversion_type == FormatConversionType.transpose_width_features:
            # Split the feature axis into (groups, features per group), swap axis 2 with the
            # per-group feature axis, then reshape to the declared output shape.
            conversion = tf.reshape(
                inp,
                [
                    l_desc.input_shape[0],
                    l_desc.input_shape[1],
                    l_desc.input_shape[2],
                    l_desc.groups,
                    int(l_desc.input_shape[3] / l_desc.groups),
                ],
            )
            conversion = tf.transpose(conversion, perm=[0, 1, 4, 3, 2])
            conversion = tf.reshape(conversion, l_desc.output_shape)
        elif l_desc.conversion_type == FormatConversionType.transpose_height_width:
            # Swap axes 1 and 2; the non-4-D path uses a 3-axis permutation.
            if len(inp.shape) == 4:
                conversion = tf.transpose(inp, perm=[0, 2, 1, 3])
            else:
                conversion = tf.transpose(inp, perm=[0, 2, 1])
            conversion = tf.reshape(conversion, l_desc.output_shape)
        elif l_desc.conversion_type == FormatConversionType.spatial_reshape:
            inp = reshape_input_by_windows(inp, l_desc.input_windows)
            inp = spatial_reshape(inp, l_desc.spatial_reshape_sizes)
            conversion = reshape_output_by_windows(inp, l_desc.output_windows)
        elif l_desc.conversion_type == FormatConversionType.tf_rgbx_to_hailo_rgb:
            # Drop the last channel.
            conversion = inp[:, :, :, :-1]
        elif l_desc.conversion_type == FormatConversionType.yuy2_to_hailo_yuv:
            # Target shape: same as the input but with 3 channels.
            output_shape = list(inp.shape)
            output_shape[-1] = 3
            conversion = yuy2_to_yuv_conversion(inp, output_shape)
        elif l_desc.conversion_type == FormatConversionType.nv12_to_hailo_yuv:
            # Target shape: same as the input but with axis 1 doubled.
            output_shape = list(inp.shape)
            output_shape[1] *= 2
            conversion = nv12_to_yuv_conversion(inp, output_shape)
        elif l_desc.conversion_type == FormatConversionType.reshape_height_features:
            inp = reshape_input_by_windows(inp, l_desc.input_windows)

            # Swap axes 2 and 3 so spatial_reshape operates on axes 1 and the original axis 3,
            # then swap back afterwards.
            inp = tf.transpose(inp, perm=[0, 1, 3, 2])
            shape = [l_desc.spatial_reshape_sizes[0], l_desc.spatial_reshape_sizes[2]]
            inp = spatial_reshape(inp, shape)
            conversion = tf.transpose(inp, perm=[0, 1, 3, 2])

            conversion = reshape_output_by_windows(conversion, l_desc.output_windows)
        else:
            # Any other conversion type is emulated as a pass-through.
            conversion = tf.identity(inp, name="format_conversion")
        if l_desc in self.mixed_numeric_layers:
            conversion = self._from_numeric(conversion)
        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, [conversion])
        return conversion

    def _build_argmax(self, l_desc, inp):
        """
        Compute the argmax index along the last axis of a 2-D or 4-D input, as float32.

        When ``l_desc.reverse_order`` is set the input is reversed along that axis first.
        """
        reduce_axis = 3 if len(inp.shape) != 2 else 1
        source = tf.reverse(tensor=inp, axis=[reduce_axis]) if l_desc.reverse_order else inp
        indices = tf.argmax(input=source, axis=reduce_axis, name="argmax", output_type=tf.int32)
        result = tf.cast(indices, tf.float32)
        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, [result])
        return result

    def _build_reduce_max(self, l_desc, inp):
        """
        Build a reduce-max op and register its node.

        With ``l_desc.groups > 1`` the feature axis (axis 3) is cut into equally sized groups and
        the maximum of every group becomes one output feature. Otherwise the maximum is taken over
        ``l_desc.reduce_axes`` with the reduced dimensions kept.

        The result is passed through ``_from_numeric`` when the layer is listed in
        ``mixed_numeric_layers``. This now applies to the grouped path as well; it was previously
        nested in the non-grouped branch only, unlike the other layer builders.
        """
        if l_desc.groups > 1:
            input_group_size = int(l_desc.input_shape[3] / l_desc.groups)
            # keepdims leaves a size-1 feature axis so the per-group maxima can be concatenated.
            group_maxima = [
                tf.reduce_max(
                    input_tensor=inp[:, :, :, g * input_group_size : (g + 1) * input_group_size],
                    axis=3,
                    name=f"reduce_max_{g}",
                    keepdims=True,
                )
                for g in range(l_desc.groups)
            ]
            reduce_max = tf.concat(group_maxima, 3, name="op")
        else:
            reduce_max = tf.reduce_max(input_tensor=inp, axis=l_desc.reduce_axes, name="reduce_max", keepdims=True)

        if l_desc in self.mixed_numeric_layers:
            reduce_max = self._from_numeric(reduce_max)
        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, [reduce_max])
        return reduce_max

    def _build_feature_shuffle(self, l_desc, inp):
        """
        Reorder the feature axis of a 4-D input and register the node.

        Without ``l_desc.groups_slice`` the features are viewed as (groups, group_size) and the two
        axes are swapped. With a ``(start, end, step)`` slice, the ``[start:end]`` part of every
        group is moved to the front of that group; ``step`` is unpacked but not used.
        """
        group_count = l_desc.groups
        slice_spec = l_desc.groups_slice
        source = tf.identity(inp, name="feature_shuffle")
        batch, height, width, features = source.shape.as_list()
        per_group = features // group_count

        if not slice_spec:
            grouped = tf.reshape(source, [-1, height, width, group_count, per_group])
            swapped = tf.transpose(a=grouped, perm=[0, 1, 2, 4, 3])
            shuffled = tf.reshape(swapped, [-1, height, width, features])
        else:
            batch_dim = -1 if batch is None else batch
            grouped = tf.reshape(source, [batch_dim, height, width, group_count, per_group])
            start, end, _step = slice_spec
            head = grouped[..., 0:start]
            middle = grouped[..., start:end]
            tail = grouped[..., end:per_group]
            # The selected slice goes first, followed by what preceded and what followed it.
            rotated = tf.concat([middle, head, tail], axis=-1)
            shuffled = tf.reshape(rotated, [batch_dim, height, width, features])

        if l_desc in self.mixed_numeric_layers:
            shuffled = self._from_numeric(shuffled)
        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, [shuffled])
        return shuffled

    def _partial_softmax(self, inputs, input_scales):
        """
        Simulate the partial-numeric softmax over the last axis.

        The per-row maximum is subtracted, the result is multiplied by ``input_scales`` and
        exponentiated, normalized by the row sum, and finally multiplied by 255.
        """
        row_peak = tf.reduce_max(inputs, axis=-1, keepdims=True)
        exponentials = tf.math.exp((inputs - row_peak) * input_scales)
        normalizer = tf.reduce_sum(exponentials, keepdims=True, axis=-1)
        probabilities = tf.math.divide(exponentials, normalizer)
        return probabilities * 255

    def _build_softmax(self, l_desc, inp, qp_in):
        """
        Build the softmax op for the current emulation target and register its node.

        * SDK_NUMERIC / SDK_DEBUG_PRECISE_NUMERIC: ``self.hsim.h_softmax`` with a 512-entry LUT
          variable; a 4-D input is processed one row (index along axis 1) at a time.
        * SDK_PARTIAL_NUMERIC: ``_partial_softmax`` with ``qp_in[1]`` as the input scale.
        * any other target: ``tf.nn.softmax`` over the last axis.

        Inputs that are not 4-D are first flattened to ``[-1, largest non-batch dimension]``.

        Args:
            l_desc: layer descriptor.
            inp: input tensor.
            qp_in: input quantization parameters; only ``qp_in[1]`` is read here.

        Returns:
            The softmax tensor, cast to int32 when ``_is_int32_numeric`` holds for the layer.

        """
        if len(inp.shape) != 4:
            ax = np.argmax(inp.shape[1:]) + 1
            inp = tf.reshape(inp, [-1, inp.shape[ax]])
        is_16bit_precision_mode = self._is_16bit_input(l_desc)
        if self.target in [EmulationInferenceTargets.SDK_NUMERIC, EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC]:
            lut_table = tf.Variable(
                tf.zeros([512], dtype=tf.uint32),
                dtype=tf.uint32,
                trainable=False,
                name="softmax_lut",
            )

            if self._is_int32_numeric(l_desc, is_16bit_precision_mode):
                inp = tf.cast(inp, tf.float32)
            if len(inp.shape) == 4:
                # Restore each row result to [batch, 1, width, features]. The former target of
                # [-1, 1, 1, features] folded the width axis into the batch axis whenever the
                # width was larger than 1; for width == 1 both shapes are identical.
                row_shape = [-1, 1, inp.shape[2], inp.shape[-1]]
                concat_softmaxs = [
                    tf.reshape(
                        self.hsim.h_softmax(inp[:, h, :, :], lut_table, self.is_mercury_arch),
                        row_shape,
                    )
                    for h in range(inp.shape[1])
                ]
                softmax = tf.concat(concat_softmaxs, 1, name="softmax")
            else:
                softmax = self.hsim.h_softmax(inp, lut_table=lut_table, is_mercury=self.is_mercury_arch)
        elif self.target in [EmulationInferenceTargets.SDK_PARTIAL_NUMERIC]:
            softmax = self._partial_softmax(inp, qp_in[1])
        else:
            softmax = tf.nn.softmax(inp, axis=-1)

        if l_desc in self.mixed_numeric_layers:
            softmax = self._from_numeric(softmax)
        # The node is registered before the optional int32 cast below.
        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, [softmax])
        if self._is_int32_numeric(l_desc, is_16bit_precision_mode):
            softmax = tf.cast(softmax, tf.int32)
        return softmax

    def _build_layernorm(self, l_desc, inp, qp_in):
        """
        Build the numeric layer-normalization op via ``self.hsim.h_layernorm`` and register its node.

        All quantization parameters are created here as zero-initialised, non-trainable variables
        (presumably filled in later when the quantized parameters are loaded - verify against the
        parameter-loading code).

        Args:
            l_desc: layer descriptor.
            inp: input tensor.
            qp_in: input quantization parameters (not read by this method).

        Returns:
            The layer-norm output, cast to int32 when ``_is_int32_numeric`` holds for the layer.

        Raises:
            BackendNotImplementedError: if the emulation target is neither SDK_NUMERIC nor
                SDK_DEBUG_PRECISE_NUMERIC.

        """
        # An explicit raise instead of `assert 0`: asserts are stripped under `python -O`, which
        # would let unsupported targets silently fall through into the numeric implementation.
        if self.target not in [
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
        ]:
            raise BackendNotImplementedError(
                f"Layer normalization emulation is not supported for target {self.target}",
            )

        is_16bit_precision_mode = self._is_16bit_input(l_desc)

        is_16bit_output = l_desc.precision_config.precision_mode in [
            PrecisionMode.a16_w16,
            PrecisionMode.a16_w16_a16,
        ]

        zero_point = tf.Variable(0, trainable=False, name="zero_point_in", dtype=tf.uint32)
        exponent_sub_value = tf.Variable(0, trainable=False, name="shift_to_plus", dtype=tf.uint32)
        sum_of_x_mult = tf.Variable(0, trainable=False, name="mult_mu", dtype=tf.uint32)
        sum_of_x_shift = tf.Variable(0, trainable=False, name="shift_x_sum", dtype=tf.uint32)
        sum_of_x_mult_n_shift = tf.Variable(0, trainable=False, name="shift_mu", dtype=tf.uint32)
        x_squared_mult = tf.Variable(0, trainable=False, name="vector_size", dtype=tf.uint32)
        sum_of_x_squares_shift = tf.Variable(0, trainable=False, name="shift_x2_sum", dtype=tf.uint32)
        epsilon = tf.Variable(0, trainable=False, name="epsilon_quant", dtype=tf.uint32)

        # Only element [0, 1] of each [1, 3] piecewise-activation table is used.
        mul_act_man = tf.cast(
            tf.Variable(tf.zeros([1, 3]), trainable=False, name="output_stage/piecewise/slopes_m", dtype=tf.float32),
            tf.uint32,
        )[0, 1]
        mul_act_exp = tf.cast(
            tf.Variable(tf.zeros([1, 3]), trainable=False, name="output_stage/piecewise/slopes_e", dtype=tf.float32),
            tf.uint32,
        )[0, 1]
        mul_act_bias = tf.cast(
            tf.Variable(tf.zeros([1, 3]), trainable=False, name="output_stage/piecewise/offsets", dtype=tf.float32),
            tf.uint32,
        )[0, 1]
        rms_mode_enable = tf.Variable(0, trainable=False, name="rms_mode_enable", dtype=tf.uint32)
        activation_ebias_mode = tf.Variable(0, trainable=False, name="activation_ebias_mode", dtype=tf.uint32)
        lut = tf.cast(tf.Variable(tf.zeros([256]), trainable=False, name="lut_table", dtype=tf.float32), tf.float32)

        # NOTE(review): the parentheses only make the original operator precedence explicit - a
        # uint32 tensor is passed for 16-bit output and the plain Python int 0 otherwise. Confirm
        # that this, rather than `tf.ones(...) * (1 if is_16bit_output else 0)`, is what was meant.
        activation_output_mode = (tf.ones([1], tf.uint32) * 1) if is_16bit_output else 0

        if self._is_int32_numeric(l_desc, is_16bit_precision_mode):
            inp = tf.cast(inp, tf.float32)
        layernorm = self.hsim.h_layernorm(
            inp,
            zero_point=zero_point,
            exponent_sub_value=exponent_sub_value,
            sum_of_x_mult=sum_of_x_mult,
            sum_of_x_shift=sum_of_x_shift,
            sum_of_x_mult_n_shift=sum_of_x_mult_n_shift,
            x_squared_mult=x_squared_mult,
            sum_of_x_squares_shift=sum_of_x_squares_shift,
            epsilon=epsilon,
            mul_act_man=mul_act_man,
            mul_act_exp=mul_act_exp,
            mul_act_bias=mul_act_bias,
            rms_mode_enable=rms_mode_enable,
            activation_ebias_mode=activation_ebias_mode,
            activation_output_mode=activation_output_mode,
            lut=lut,
        )

        # The node is registered before the optional int32 cast below.
        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, [layernorm])
        if self._is_int32_numeric(l_desc, is_16bit_precision_mode):
            layernorm = tf.cast(layernorm, tf.int32)
        return layernorm

    def _build_feature_splitter(self, l_desc, inp):
        """
        Split the feature axis into one tensor per entry of ``l_desc.output_shapes``.

        The features are first cut into ``l_desc.groups`` equal groups, every group is cut by the
        per-output share of the features, and matching pieces of all groups are concatenated,
        e.g. 20 features, 2 groups, outputs of 6 and 14: 20 -> 10,10 -> [3,7],[3,7] -> [6,14].
        """
        per_group_sizes = [shape[-1] // l_desc.groups for shape in l_desc.output_shapes]
        feature_axis = 3 if len(inp.shape) != 2 else 1

        groups = tf.split(inp, l_desc.groups, axis=feature_axis)
        pieces_per_group = [tf.split(group, per_group_sizes, axis=feature_axis) for group in groups]
        outputs = [
            tf.concat([pieces[out_idx] for pieces in pieces_per_group], axis=feature_axis)
            for out_idx in range(len(l_desc.output_shapes))
        ]

        if l_desc in self.mixed_numeric_layers:
            outputs = self._from_numeric(outputs)
        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, outputs)
        return outputs

    def _build_row_splitter(self, l_desc, inp):
        """
        Split the input along axis 1 into one tensor per entry of ``l_desc.output_shapes``.

        With ``l_desc.sequential_row_split`` the outputs are consecutive row blocks whose heights
        come from the output shapes; otherwise rows are dealt out round-robin.
        """
        output_count = len(l_desc.output_shapes)
        if l_desc.sequential_row_split:
            outputs = []
            first_row = 0
            for shape in l_desc.output_shapes:
                next_row = first_row + shape[1]
                outputs.append(inp[:, first_row:next_row, :, :])
                first_row = next_row
        else:
            # round-robin split
            outputs = [inp[:, offset::output_count, :, :] for offset in range(output_count)]

        if l_desc in self.mixed_numeric_layers:
            outputs = self._from_numeric(outputs)
        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, outputs)
        return outputs

    def _build_width_splitter(self, l_desc, inp):
        """Split the input along axis 2 into consecutive blocks sized by ``l_desc.output_shapes``."""
        outputs = []
        left_edge = 0
        for shape in l_desc.output_shapes:
            right_edge = left_edge + shape[2]
            outputs.append(inp[:, :, left_edge:right_edge, :])
            left_edge = right_edge

        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, outputs)
        return outputs

    def _build_precision_splitter(self, l_desc, inp):
        """
        Split the input into its low byte and next-higher byte, both cast to float32.

        In NORMAL mode the two bytes are returned as two tensors. In PIXELS mode a single tensor
        is returned in which low and high bytes alternate along axis 2 (doubling its size).

        Raises:
            BackendNotImplementedError: for any other precision split mode.

        """
        low_byte = tf.cast(tf.bitwise.bitwise_and(inp, 0xFF), dtype=tf.float32)
        shifted = tf.bitwise.right_shift(inp, 8)
        high_byte = tf.cast(tf.bitwise.bitwise_and(shifted, 0xFF), dtype=tf.float32)

        split_mode = l_desc.precision_split_mode
        if split_mode == PrecisionSplitMode.NORMAL:
            outputs = [low_byte, high_byte]
        elif split_mode == PrecisionSplitMode.PIXELS:
            height, width, features = inp.shape[1], inp.shape[2], inp.shape[3]
            stacked = tf.concat((low_byte, high_byte), axis=2)
            stacked = tf.reshape(stacked, [-1, height, 2, width, features])
            # Move the (low, high) axis next to the width so the bytes interleave per pixel.
            interleaved = tf.transpose(stacked, (0, 1, 3, 2, 4))
            outputs = [tf.reshape(interleaved, [-1, height, width * 2, features])]
        else:
            raise BackendNotImplementedError("PrecisonSplitterMode is not supported")

        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, outputs)
        return outputs

    def _build_matmul(self, l_desc, inputs, inter_layer_precision_mode):
        """
        Build the matmul of ``inputs[0]`` (data) by ``inputs[1]`` (weights) for the current target.

        Both inputs are first reshaped to batched 2-D matrices (groups folded into the batch axis);
        the product is then reshaped back to a 4-D layout by ``prepare_matmul_output``. The
        prepared weights matrix is also stored in ``self._kernel_variables`` under the layer name.

        Args:
            l_desc: layer descriptor (``groups``, ``input_windows``, ``input_tiles``,
                ``transpose_matmul_input``, ``zp_comp_rank``, ``input_shapes`` are read).
            inputs: sequence holding the data tensor and the weights tensor.
            inter_layer_precision_mode: forwarded to the numeric implementation.

        Returns:
            Tuple ``(matmul_output, weights_mat)``.

        Raises:
            BackendNotImplementedError: if the emulation target is not one of the native or
                numeric targets handled below.

        """
        is_transposed = l_desc.transpose_matmul_input  # first implementation is transposed
        data_mat = self.prepare_matmul_data_input(inputs[0], l_desc.groups, l_desc.input_windows)
        weights_mat = self.prepare_matmul_weights_input(
            inputs[1], l_desc.groups, is_transposed, l_desc.input_windows, l_desc.input_tiles
        )
        if l_desc.zp_comp_rank > 0 and self.target != EmulationInferenceTargets.SDK_NUMERIC:
            # In the case of zp compensation, we need to ignore the additional feature.
            # Transposed input is not supported with zp compensation
            weights_mat = weights_mat[:, :-1, :] if is_transposed else weights_mat[:, :, :-1]

        if self.target in {EmulationInferenceTargets.SDK_FP_OPTIMIZED, EmulationInferenceTargets.SDK_NATIVE}:
            matmul = self._build_matmul_native(data_mat, weights_mat)

        elif self.target in {
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,
            EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
        }:
            zp_in_data = tf.Variable(
                tf.random.normal([1]), dtype=tf.float32, name="zero_point_in", shape=tf.TensorShape(None)
            )
            zp_in_data = self._get_const("zp_x_vals", zp_in_data)
            # we want to make it a vector also if given as a number
            zp_in_data = tf.reshape(zp_in_data, [-1])

            if self.target in {EmulationInferenceTargets.SDK_PARTIAL_NUMERIC}:
                matmul = self._build_matmul_partial_numeric(data_mat, zp_in_data, weights_mat)
            else:
                matmul = self._build_matmul_numeric(
                    data_mat,
                    zp_in_data,
                    weights_mat,
                    inter_layer_precision_mode,
                    l_desc.zp_comp_rank,
                )

        else:
            # Previously an unhandled target left `matmul` unbound and failed further down with
            # an UnboundLocalError; fail here with an explicit message instead.
            raise BackendNotImplementedError(f"Matmul emulation is not supported for target {self.target}")

        # reshape outputs to [batch, H, W, HxW] (mostly for easier flow)
        self._kernel_variables[l_desc.name] = weights_mat
        matmul_output = self.prepare_matmul_output(
            l_desc.input_shapes[0][1:3], matmul, l_desc.groups, l_desc.input_windows
        )
        return matmul_output, weights_mat

    def prepare_matmul_data_input(self, inp, groups, input_windows):
        """
        Reshape the matmul data input to ``[batch, H*W, features]``.

        The input is first rearranged by ``reshape_input_by_windows``. For ``groups > 1`` the
        feature axis is split per group and the groups are folded into the batch axis, so the
        result is ``[batch * groups, H*W, features // groups]``.
        """
        windowed = reshape_input_by_windows(inp, input_windows)
        height, width, features = windowed.shape[1], windowed.shape[2], windowed.shape[3]
        spatial_size = height * width

        if groups != 1:
            # for grouped matmul, move the group axis next to the batch so the op runs per group
            group_features = features // groups
            grouped = tf.reshape(windowed, [-1, height, width, groups, group_features])
            grouped = tf.transpose(grouped, (0, 3, 1, 2, 4))
            return tf.reshape(grouped, [-1, spatial_size, group_features])

        return tf.reshape(windowed, [-1, spatial_size, features])

    def prepare_matmul_weights_input(self, inp, groups, is_transposed, input_windows, input_tiles):
        """
        Reshape the matmul weights input to a batched 2-D matrix.

        The input is first rearranged by ``reshape_input_by_windows``. For ``groups == 1`` the
        result is ``[batch, H*W, features]``. For grouped matmul the features are optionally tiled
        (``input_tiles[-1][-1]`` times), split per group, and the groups are folded into the batch
        axis. When ``is_transposed`` is set the last two axes of the result are swapped.
        """
        weights = reshape_input_by_windows(inp, input_windows)
        weight_feature_tiles = input_tiles[-1][-1]
        height, width = weights.shape[1], weights.shape[2]

        if groups == 1:
            flat = tf.reshape(weights, [-1, height * width, weights.shape[3]])
            if is_transposed:
                return tf.transpose(flat, perm=[0, 2, 1])
            return flat

        if weight_feature_tiles > 1:
            unique_groups = groups // weight_feature_tiles
            # isolate the unique groups, repeat each one along its feature axis, then merge back
            weights = tf.reshape(weights, [-1, height, width, unique_groups, weights.shape[3] // unique_groups])
            weights = tf.tile(weights, [1, 1, 1, 1, weight_feature_tiles])
            weights = tf.reshape(weights, [-1, height, width, weights.shape[-1] * unique_groups])

        # split into the actual groups and fold them into the batch axis
        group_features = weights.shape[-1] // groups
        weights = tf.reshape(weights, [-1, height, width, groups, group_features])
        weights = tf.transpose(weights, (0, 3, 1, 2, 4))
        weights = tf.reshape(weights, [-1, height * width, group_features])
        return tf.transpose(weights, (0, 2, 1)) if is_transposed else weights

    def prepare_matmul_output(self, orig_shape, matmul, groups, output_windows):
        """
        Reshape a batched matmul result back to a 4-D layout.

        The group axis is unfolded from the batch axis and merged into the last axis, the spatial
        axis is split into a ``orig_shape // output_windows`` window, and the windows are
        reassembled by ``reshape_output_by_windows``.
        """
        window_height = orig_shape[0] // output_windows[0]
        window_width = orig_shape[1] // output_windows[1]

        per_group = tf.reshape(matmul, [-1, groups, matmul.shape[1], matmul.shape[2]])
        spatial_first = tf.transpose(per_group, [0, 2, 1, 3])
        merged = tf.reshape(spatial_first, [-1, window_height, window_width, spatial_first.shape[3] * groups])
        return reshape_output_by_windows(merged, output_windows)

    def _build_matmul_native(self, data_input, weights_input):
        """Return the plain ``tf.matmul`` of the data matrix by the (untransposed) weights matrix."""
        product = tf.matmul(data_input, weights_input, transpose_b=False, name="matmul")
        return product

    def _build_matmul_partial_numeric(self, data_input, zp_in_data, weights_input):
        """Subtract the first zero-point value from the data, then multiply by the weights matrix."""
        zero_point = zp_in_data[0]
        centered = data_input - zero_point
        return tf.matmul(centered, weights_input, transpose_b=False, name="matmul")

    def _build_matmul_numeric(self, data_input, zp_in_data, weights_input, inter_layer_precision_mode, zp_comp_rank):
        """
        Build the numeric matmul via ``self.hsim.h_matmul``.

        With ``zp_comp_rank > 0`` the last ``zp_comp_rank`` rows of the weights are treated as
        zero-point compensation rows: for each of them, a block of columns holding the matching
        zero-point value is appended to the data and the row itself is appended (repeated) to the
        remaining weights. The repeat count of every block is read from the ``zp_feed_repeat``
        variable.
        """
        acc_size = self._get_accumulator_size(inter_layer_precision_mode, False)
        lhs, rhs = data_input, weights_input

        if zp_comp_rank > 0:
            repeats = tf.Variable(
                tf.random.normal([1]), dtype=tf.float32, name="zp_feed_repeat", shape=tf.TensorShape(None)
            )
            # we want to make it a vector also if given as a number
            repeats = tf.reshape(repeats, [-1])

            compensation_rows = weights_input[:, -zp_comp_rank:, :]
            rhs = weights_input[:, :-zp_comp_rank, :]

            for idx in range(zp_comp_rank):
                count = tf.cast(repeats[idx], dtype=tf.int32)
                ones_column = tf.ones_like(lhs[:, :, -1:])
                zp_block = tf.repeat(ones_column * zp_in_data[idx], count, axis=2)
                weight_block = tf.repeat(compensation_rows[:, idx : idx + 1, :], count, axis=1)

                lhs = tf.concat([lhs, zp_block], axis=2)
                rhs = tf.concat([rhs, weight_block], axis=1)

        return self.hsim.h_matmul(
            lhs,
            rhs,
            self.emulation_mult_shift,
            accumulator_size=acc_size,
            use_fp16_acc=False,
            name="matmul",
        )

    def _build_feature_multiplier_layer(self, l_desc, inp):
        """
        Multiply features that share the same height and width position.

        For the ``square`` multiplier type every feature is multiplied by itself. Otherwise a
        ``power_table`` variable of shape (output features, input features) is used: each row holds
        the power applied to every input feature, and the powered features are multiplied together
        along the feature axis to form one output feature.

        To keep the scale, an output feature that is a single un-multiplied input feature is
        multiplied by the quantized '1' value (equals 1 / input scale). This limits each row of the
        power table to one of:
        - a single "1": pick one feature and multiply it by the quantized '1'
        - two "1" values: pick two features and multiply them together
        - a single "2": pick one feature and square it.

        For the SDK_NUMERIC and SDK_PARTIAL_NUMERIC targets the input is first scaled, biased and
        rounded to 9 bits, and the product goes through bankers rounding; for other targets the
        scale stays 1.0.
        """

        def pow_and_multiply(input_tensor, pow_table):
            """
            Raise input_tensor to the powers in pow_table and multiply the results over the feature axis.

            Example: input_tensor:
                        [[[ 1, 2, 3],
                          [ 4, 5, 6]]]
                     pow_table:
                        [[ 1, 1, 0 ],
                         [ 0, 0, 2 ],
                         [ 0, 1, 1 ]]
                     result:
                        [[[ 2, 9, 6 ],
                          [ 20, 36, 30]]]
            """
            expanded = tf.expand_dims(input_tensor, -1)
            powers = tf.transpose(pow_table, perm=[1, 0])
            powers = tf.reshape(powers, [1, 1, 1, *powers.shape])
            return tf.reduce_prod(tf.pow(expanded, powers), axis=-2)

        features_in = l_desc.input_shape[-1]
        features_out = sum(shape[-1] for shape in l_desc.output_shapes)

        is_square = l_desc.feature_multiplier_type == FeatureMultiplierType.square
        is_quantized = self.target in (
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,
        )

        if not is_square:
            power_table = tf.Variable(tf.zeros([features_out, features_in]), trainable=False, name="power_table")

        if is_quantized:
            kernel = tf.Variable(
                tf.random.normal([1, 1, l_desc.input_shape[-1], 1], stddev=0.35),
                name="kernel",
                shape=tf.TensorShape(None),
            )
            scale_weight = tf.reshape(kernel, [-1])[0]
            shift = tf.cast(self.emulation_mult_shift, np.float32)
            bias_in = tf.Variable(0, dtype=tf.float32, name="bias_in")
            quantized_one = tf.Variable(0, dtype=tf.float32, name="quantized_one")

            # Scale the input, add the input bias and round
            inp = inp * (scale_weight / shift) + bias_in
            inp = self._round_input_9bit(inp)

        if is_square:
            result = inp * inp
        else:
            result = pow_and_multiply(inp, power_table)
            if is_quantized:
                # Number of multiplications each output feature is built from (1 or 2); the
                # exponent 2 - n turns that into quantized_one (n == 1) or 1 (n == 2).
                multiplications = tf.reduce_sum(power_table, axis=1)
                result = result * tf.pow(quantized_one, 2 - multiplications)

        if is_quantized:
            result = self._ew_mult_bankers_rounding(result)

        return result

    def _build_bbox_decoder(self, l_desc, bbox_inputs):
        """
        Adds a bbox decoder operation to the graph.

        The box edges are decoded from the two decoder inputs (In1, In2) as:
            ymin = y_center + anchors_heights * In1 + anchors_heights_minus_div_2 * In2
            xmin = x_center + anchors_widths * In1 + anchors_widths_minus_div_2 * In2
            ymax = y_center + anchors_heights * In1 + anchors_heights_div_2 * In2
            xmax = x_center + anchors_widths * In1 + anchors_widths_div_2 * In2

        Args:
            l_desc: Layer description; its first input shape sets the grid size and its first
                output shape sets the number of anchors (output features / 4).
            bbox_inputs: Either a single tensor, which is split here into In1/In2 by channel index,
                or two tensors that are used as In1 and In2 as given.

        Returns:
            The output tensor of the decoder op.
        """
        first_input_shape = l_desc._input_shapes[0]
        grid_height, grid_width = first_input_shape[1], first_input_shape[2]
        num_of_anchors = int(l_desc._output_shapes[0][3] / 4)

        def zeros_variable(shape, var_name):
            return tf.Variable(tf.zeros(shape), name=var_name)

        y_centers = zeros_variable([grid_height, num_of_anchors * 2], "y_centers")
        x_centers = zeros_variable([grid_width, num_of_anchors * 2], "x_centers")
        anchors_heights = zeros_variable(num_of_anchors, "anchors_heights")
        anchors_heights_div_2 = zeros_variable(num_of_anchors, "anchors_heights_div_2")
        anchors_heights_minus_div_2 = zeros_variable(num_of_anchors, "anchors_heights_minus_div_2")
        anchors_widths = zeros_variable(num_of_anchors, "anchors_widths")
        anchors_widths_div_2 = zeros_variable(num_of_anchors, "anchors_widths_div_2")
        anchors_widths_minus_div_2 = zeros_variable(num_of_anchors, "anchors_widths_minus_div_2")

        if len(bbox_inputs) == 1:
            # One packed input: out of every 4 consecutive channels, the first two feed In1
            # and the last two feed In2.
            packed_input = bbox_inputs[0]
            num_features = packed_input.shape[3]
            in1_indx = [idx for idx in range(num_features) if idx % 4 in (0, 1)]
            in2_indx = [idx for idx in range(num_features) if idx % 4 in (2, 3)]
            bbox_inputs = [
                tf.gather(packed_input, in1_indx, axis=3),
                tf.gather(packed_input, in2_indx, axis=3),
            ]
        else:
            # Needed in order to force scale matching on both inputs
            second_input = bbox_inputs[1]
            second_input_name = second_input.name
            layer_inference = self.conv_layers_inference[self._get_param_key(l_desc.name)]
            layer_inference["second_input_name"] = second_input_name
            layer_inference["second_input_tensor"] = second_input

        numeric_targets = (
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
        )
        if self.target in numeric_targets:
            decoder_weights = [
                y_centers,
                x_centers,
                anchors_heights,
                anchors_heights_div_2,
                anchors_heights_minus_div_2,
                anchors_widths,
                anchors_widths_div_2,
                anchors_widths_minus_div_2,
            ]
            if self._is_int32_numeric(l_desc, self._is_16bit_input(l_desc)):
                decoder_weights = [tf.cast(weight, tf.int32) for weight in decoder_weights]
            return self.hsim.h_bbox_decoder(
                bbox_inputs[0],
                bbox_inputs[1],
                *decoder_weights,
                self.emulation_mult_shift,
                accumulator_size=self.accumulator_size,
                use_fp16_acc=False,
                name="op",
            )

        # In partial numeric the accumulator shift is done on the entire output (pre activation).
        # x_centers/y_centers are added to the output similar to double scale initialization bias.
        # Their values shouldn't be shifted, so we compensate here for the shift at the end.
        if self.target == EmulationInferenceTargets.SDK_PARTIAL_NUMERIC:
            shift = tf.cast(self.emulation_mult_shift, np.float32)
            x_centers = tf.multiply(x_centers, shift)
            y_centers = tf.multiply(y_centers, shift)
        return decode_branch(
            bbox_inputs[0],
            bbox_inputs[1],
            num_of_anchors,
            y_centers,
            x_centers,
            anchors_heights,
            anchors_heights_div_2,
            anchors_heights_minus_div_2,
            anchors_widths,
            anchors_widths_div_2,
            anchors_widths_minus_div_2,
        )

    def _build_bbox_decoder_single_weight_per_proposal(self, l_desc, bbox_inputs):
        """
        Adds a bbox decoder that uses a single weight per proposal to the graph.

        The box edges are decoded as:
            ymin = y_center - height_scale_factor * In2
            xmin = x_center - width_scale_factor * In1
            ymax = y_center + height_scale_factor * In4
            xmax = x_center + width_scale_factor * In3

        Args:
            l_desc: Layer description; its first input shape sets the grid size and the number
                of anchors (input features / 4).
            bbox_inputs: List holding exactly one input tensor.

        Returns:
            The output tensor of the hsim decoder op.
        """
        assert len(bbox_inputs) == 1
        first_input_shape = l_desc._input_shapes[0]
        grid_height, grid_width = first_input_shape[1], first_input_shape[2]
        num_of_anchors = int(first_input_shape[3] / 4)

        def zeros_variable(shape, var_name):
            return tf.Variable(tf.zeros(shape), name=var_name)

        decoder_weights = [
            zeros_variable([grid_height, num_of_anchors * 2], "y_centers"),
            zeros_variable([grid_width, num_of_anchors * 2], "x_centers"),
            zeros_variable(num_of_anchors, "anchors_height_scale_factor"),
            zeros_variable(num_of_anchors, "anchors_height_scale_factor_minus"),
            zeros_variable(num_of_anchors, "anchors_width_scale_factor"),
            zeros_variable(num_of_anchors, "anchors_width_scale_factor_minus"),
        ]

        # Out of every 4 consecutive channels, the first two go to the first decoder input
        # and the last two go to the second one.
        packed_input = bbox_inputs[0]
        num_features = packed_input.shape[3]
        in1_indx = [idx for idx in range(num_features) if idx % 4 in (0, 1)]
        in2_indx = [idx for idx in range(num_features) if idx % 4 in (2, 3)]
        input1 = tf.gather(packed_input, in1_indx, axis=3)
        input2 = tf.gather(packed_input, in2_indx, axis=3)

        if self._is_int32_numeric(l_desc, self._is_16bit_input(l_desc)):
            decoder_weights = [tf.cast(weight, tf.int32) for weight in decoder_weights]

        return self.hsim.h_single_weight_per_proposal_bbox_decoder(
            input1,
            input2,
            *decoder_weights,
            self.emulation_mult_shift,
            accumulator_size=self.accumulator_size,
            use_fp16_acc=False,
            name="op",
        )

    def _prepare_scores_for_proposal_generator(self, l_desc, scores, inter_layer_precision_mode):
        """
        Builds the scores input of the proposal generator.

        The scores are multiplied with some weight in order to stretch them to 16bit: two numeric
        depthwise convolutions with separate kernels are applied to the same scores tensor and
        their results are summed.

        Args:
            l_desc: Layer description; the last dimension of its second input shape sets the kernel size.
            scores: Scores tensor fed to both convolutions.
            inter_layer_precision_mode: Precision mode used to pick the accumulator size.

        Returns:
            The sum of the two convolution outputs.
        """
        kernel_shape = [1, 1, l_desc._input_shapes[1][-1], 1]
        kernel_names = ("mock_op1/quant_kernel", "mock_op2/quant_kernel")
        raw_kernels = [tf.Variable(tf.zeros(kernel_shape), name=kernel_name) for kernel_name in kernel_names]
        score_kernels = [tf.transpose(a=raw_kernel, perm=[0, 1, 3, 2]) for raw_kernel in raw_kernels]
        accumulator_size = self._get_accumulator_size(inter_layer_precision_mode, False)
        elementwise = tf.zeros([1], tf.float32)  # placeholder
        products = [
            self._build_numeric_conv(
                l_desc,
                scores,
                score_kernel,
                [1, 1, 1, 1],
                elementwise,
                accumulator_size,
                self._get_strides(l_desc),
                self._get_dilation(l_desc),
                self.hsim.h_depth_wise,
            )
            for score_kernel in score_kernels
        ]
        return tf.add(*products)

    def _calculate_deconv_feature_interleave_output_shape(self, l_desc, feature_interleaver_index):
        if feature_interleaver_index < 2:
            return l_desc.output_shape

        return [
            l_desc.output_shape[0],
            int(l_desc.output_shape[1] / 2 ** (feature_interleaver_index - 1)),
            int(l_desc.output_shape[2] / 2 ** (feature_interleaver_index - 1)),
            int(l_desc.output_shape[3] * (2 ** (2 * (feature_interleaver_index - 1)))),
        ]

    def _build_deconv_feature_interleave(self, l_desc, inp):
        """
        Chains the feature interleave stages of a deconv layer.

        The number of stages is derived from log2 of the kernel height, with one extra stage when
        the kernel height is 2 or when both the kernel height and the stride height are 4.
        Only the first stage may remove padding; it does so unless the kernel height is 2 or the
        kernel height equals the stride height.

        Returns:
            The output tensor of the last feature interleave stage.
        """
        kernel_height = l_desc.kernel_height
        stride_height = l_desc.stride_height

        stage_count = int(np.log2(kernel_height) - 1)
        if kernel_height == 2 or (kernel_height == 4 and stride_height == 4):
            stage_count += 1

        keeps_padding = kernel_height == 2 or kernel_height == stride_height
        first_shape = self._calculate_deconv_feature_interleave_output_shape(l_desc, stage_count)
        result = self._build_feature_interleave(l_desc, inp, first_shape, not keeps_padding)

        # The remaining stages run with descending index, down to index 1, and never remove padding.
        for stage_index in range(stage_count - 1, 0, -1):
            stage_shape = self._calculate_deconv_feature_interleave_output_shape(l_desc, stage_index)
            result = self._build_feature_interleave(l_desc, result, stage_shape, False)

        return result

    def _build_super_deconv_depth_to_space(self, l_desc, inp):
        """
        Applies depth to space separately on every group of the input and concatenates the results.

        The block size is taken from the layer strides (``strides[1]`` for height and ``strides[2]``
        for width); the channels are split into ``l_desc.groups`` equal groups.
        """

        def group_depth_to_space(group_inp):
            block_h = l_desc.strides[1]
            block_w = l_desc.strides[2]
            rows, cols, depth = group_inp.shape[1:]
            target_rows = int(rows * block_h)
            target_cols = int(cols * block_w)
            target_depth = int(depth // (block_h * block_w))
            # Expose the block axes, move each one next to its spatial axis, then merge them.
            blocks = tf.reshape(group_inp, (-1, rows, cols, block_h, block_w, target_depth))
            blocks = tf.transpose(a=blocks, perm=(0, 1, 3, 2, 4, 5))
            return tf.reshape(blocks, (-1, target_rows, target_cols, target_depth))

        group_size = inp.shape[-1] // l_desc.groups
        per_group = [
            group_depth_to_space(inp[..., group * group_size : (group + 1) * group_size])
            for group in range(l_desc.groups)
        ]
        return tf.identity(tf.concat(per_group, axis=-1, name="deconv_d2s"))

    def _build_super_deconv_slice(self, l_desc, inp):
        strides = [l_desc.strides[1], l_desc.strides[2]]
        rates = [math.ceil(l_desc.kernel_height / strides[0]), math.ceil(l_desc.kernel_width / strides[1])]
        end_slice = [max(1, s // 2) for s in strides]
        start_slice = [a - b for a, b in zip(strides, end_slice)]
        if rates[0] % 2 != 1:
            inp = inp[:, start_slice[0] : -end_slice[0], :, :]
        if rates[1] % 2 != 1:
            inp = inp[:, :, start_slice[1] : -end_slice[1], :]
        return inp

    def _build_feature_interleave(self, l_desc, inp, output_shape, should_remove_padding):
        """
        Adds a feature interleave operation to the graph and registers its node under l_desc.

        The input channels are split into four quadrant tensors (NW, NE, SW, SE). NW/SW and NE/SE
        are stacked along the rows, the two results are stacked along the columns, and the whole
        is reshaped to output_shape.

        Args:
            l_desc: Layer description the resulting node is registered for.
            inp: Input tensor.
            output_shape: Shape of the interleaved output.
            should_remove_padding: When this is anything but the literal False, every quadrant is
                cropped to half of the output height/width, using a per-quadrant offset of 0 or 1.

        Returns:
            The interleaved tensor.
        """
        fi_rate = 2
        out_batch, out_height, out_width, out_features = (
            output_shape[0],
            output_shape[1],
            output_shape[2],
            output_shape[3],
        )
        half_width_shape = [out_batch, out_height, old_div(out_width, 2), 1, out_features]

        per_feature = tf.split(inp, num_or_size_splits=out_features, axis=3)
        quadrants_by_feature = tf.stack(per_feature, axis=4)
        north_west, north_east, south_west, south_east = tf.split(
            quadrants_by_feature,
            num_or_size_splits=(fi_rate**2),
            axis=3,
        )

        if should_remove_padding is not False:
            quadrant_size = [
                out_batch,
                old_div(out_height, fi_rate),
                old_div(out_width, fi_rate),
                1,
                out_features,
            ]
            north_west = tf.slice(north_west, [0, 0, 0, 0, 0], quadrant_size)
            north_east = tf.slice(north_east, [0, 0, 1, 0, 0], quadrant_size)
            south_west = tf.slice(south_west, [0, 1, 0, 0, 0], quadrant_size)
            south_east = tf.slice(south_east, [0, 1, 1, 0, 0], quadrant_size)

        west_rows = tf.stack((north_west, south_west), axis=2)
        east_rows = tf.stack((north_east, south_east), axis=2)
        west_column = tf.reshape(west_rows, shape=half_width_shape)
        east_column = tf.reshape(east_rows, shape=half_width_shape)
        interleaved_columns = tf.stack((west_column, east_column), axis=-2)
        feature_interleave = tf.reshape(interleaved_columns, shape=output_shape, name="feature_interleave")

        if l_desc in self.mixed_numeric_layers and l_desc.op != LayerType.deconv:
            feature_interleave = self._from_numeric(feature_interleave)
        self.nodes[l_desc] = TFNode(l_desc.name, None, None, None, None, None, None, [feature_interleave])
        return feature_interleave

    def _build_kernel(self, kernel_var, l_desc):
        """
        Wraps the kernel variable with an optional per channel clipping op.

        When clipping is disabled the kernel variable is returned untouched. Otherwise a non
        trainable "weights_clipping_values" variable holding a [min, max] pair per output channel
        (initialized to -inf/+inf) is created, registered for initialization and used to clip
        the kernel.

        Args:
            kernel_var: The kernel variable.
            l_desc: Layer description of the layer that owns the kernel.

        Returns:
            Tuple of (kernel to use in the graph, original kernel variable).
        """
        if not self._enable_clipping:
            return kernel_var, kernel_var

        width_acts_as_features = (
            l_desc.op in [LayerType.batch_norm, LayerType.normalization, LayerType.dw, LayerType.conv]
            and l_desc.transpose_output_width_features
        )
        output_features = l_desc.output_width if width_acts_as_features else l_desc.output_features

        lower_init = tf.fill([output_features], -np.inf)
        upper_init = tf.fill([output_features], np.inf)
        clipping_values = tf.Variable(
            initial_value=[lower_init, upper_init],
            trainable=False,
            name="weights_clipping_values",
        )

        kernel_dims = kernel_var.shape.as_list()
        if l_desc.op in [LayerType.dw, LayerType.normalization]:
            # Here the per channel values are shaped as a column before being expanded to the kernel shape.
            lower_column = tf.reshape(clipping_values[0], (output_features, 1))
            upper_column = tf.reshape(clipping_values[1], (output_features, 1))
            lower_bound = lower_column * tf.fill(kernel_dims, 1.0)
            upper_bound = upper_column * tf.fill(kernel_dims, 1.0)
        else:
            lower_bound = tf.broadcast_to(clipping_values[0], kernel_dims)
            upper_bound = tf.broadcast_to(clipping_values[1], kernel_dims)

        self._add_variables_to_initialize(l_desc.name, [clipping_values])
        clipped_kernel = tf.clip_by_value(
            kernel_var,
            clip_value_min=lower_bound,
            clip_value_max=upper_bound,
            name="clipped_kernel",
        )
        return clipped_kernel, kernel_var

    def _build_dense(self, l_desc, inp, inter_layer_precision_mode):
        """
        Adds a fully connected (dense) matrix multiplication node to the graph.

        The input is flattened to 2D, multiplied by the layer kernel (through the hsim matmul op
        on the numeric targets, tf.matmul otherwise) and reshaped to [-1, 1, 1, features].

        Args:
            l_desc: Layer description of the dense layer.
            inp: Input tensor; all the dimensions after the first one are flattened.
            inter_layer_precision_mode: Precision mode, used to pick the accumulator size and
                passed on when the weights are quantized in fine tune.

        Returns:
            Tuple of (output tensor, kernel tensor used for the multiplication).
        """
        input_is_16bit = self._is_16bit_input(l_desc)
        accumulator_size = self._get_accumulator_size(inter_layer_precision_mode, input_is_16bit)
        kernel_init = tf.random.normal(l_desc.kernel_shape, stddev=0.35)
        k, k_var = self._build_kernel(tf.Variable(kernel_init, name="kernel"), l_desc)
        self._kernel_variables[l_desc.name] = k_var

        # Flatten
        flat_size = math.prod(inp.shape[1:])
        inp = tf.reshape(inp, shape=[-1, flat_size])

        numeric_targets = (
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
        )
        if self.target not in numeric_targets:
            if self.target == EmulationInferenceTargets.SDK_PARTIAL_NUMERIC and self._is_zp_required(
                input_is_16bit
            ):
                zp_kernel = tf.Variable(initial_value=0, dtype=tf.float32, trainable=False, name="zp_kernel")
                self._add_variables_to_initialize(l_desc.name, [zp_kernel])
                k = k - zp_kernel

            if self.target == EmulationInferenceTargets.SDK_FINE_TUNE:
                with tf.compat.v1.variable_scope("fine_tune_weights"):
                    kernel_delta = tf.Variable(tf.zeros(tf.shape(k)), dtype=tf.float32, name="kernel_delta")
                self._kernel_delta_variables[l_desc.name] = kernel_delta
                quantize_weights = (
                    self.fine_tune_params.should_quantize_weights and l_desc.name not in self.mixed_native_layers
                )
                if quantize_weights:
                    k = self._build_fine_tune_weights(k, l_desc, inter_layer_precision_mode)

            # native dense layer
            op = tf.matmul(inp, k, name="matmul_op")
        else:
            # HMatMul is generic and supports both Dense and Matmul operators
            if self._is_int32_numeric(l_desc, input_is_16bit):
                k = tf.cast(k, tf.int32)
            op = self.hsim.h_matmul(
                inp,
                k,
                self.emulation_mult_shift,
                accumulator_size=accumulator_size,
                use_fp16_acc=False,
                name="matmul_op",
            )

            if self._is_zp_required(input_is_16bit):
                # The zero point term is the input multiplied by a single column of -zp_kernel,
                # tiled over all the output features and added to the matmul result.
                zp_kernel = tf.Variable(initial_value=0, dtype=tf.int32, trainable=False, name="zp_kernel")
                self._add_variables_to_initialize(l_desc.name, [zp_kernel])
                ones_column = tf.ones([k.shape[0], 1], dtype=tf.int32)
                zp_column = tf.scalar_mul(-zp_kernel, ones_column)
                zp_term = self.hsim.h_matmul(
                    inp,
                    zp_column,
                    self.emulation_mult_shift,
                    accumulator_size=accumulator_size,
                    use_fp16_acc=False,
                    name="op_zp",
                )
                zp_term = tf.tile(zp_term, [1, k.shape[1]])
                op = self.hsim.h_add(
                    op,
                    zp_term,
                    1,
                    1,
                    self.emulation_mult_shift,
                    accumulator_size=accumulator_size,
                    use_fp16_acc=False,
                    name="op_after_zp",
                )

        op = tf.reshape(op, [-1, 1, 1, op.shape[-1]], name="op")
        return op, k

    def _get_conv_op_tf(self, l_desc, inputs, filters, strides):
        """
        Returns appropriate tf op for convolutional layers

        Dispatches on l_desc.op (conv, reduce_sum, dw, deconv, avgpool) and on the emulation target,
        and builds the matching TensorFlow op on inputs[0].

        Args:
            l_desc: Layer description. Its op, groups, dilations, dynamic_weights, kernel_shape,
                padding and input/output shapes are read here.
            inputs: Sequence of input tensors. inputs[0] is the data input; inputs[1] is read only
                for a depthwise layer with dynamic weights.
            filters: Kernel tensor of the layer.
            strides: Strides passed to the TensorFlow op.

        Returns:
            The output tensor of the created op.

        Raises:
            BackendEmulatorException: For a depthwise layer that has both dilation > 1 and strides > 1.
            BackendNotImplementedError: When l_desc.op is not one of the handled layer types.
        """
        inp = inputs[0]
        # In partial numeric, when a zero point is required, a scalar kernel zero point variable is
        # created; it is subtracted from the kernel before the op is built.
        zp_kernel = None
        if self.target in [EmulationInferenceTargets.SDK_PARTIAL_NUMERIC]:
            if self._is_zp_required(self._is_16bit_input(l_desc)):
                zp_kernel = tf.Variable(initial_value=0, dtype=tf.float32, trainable=False, name="zp_kernel")
                self._add_variables_to_initialize(l_desc.name, [zp_kernel])

        # Grouped conv/deconv: every group is computed on its own slice of the input channels and of
        # the kernel output channels, and the group outputs are concatenated on the channels axis.
        if l_desc.op in [LayerType.deconv, LayerType.conv] and l_desc.groups > 1:
            concat_inputs = []
            input_group_size = int(l_desc.input_shape[3] / l_desc.groups)
            output_group_size = int(int(filters.shape[3]) / l_desc.groups)

            for g in range(l_desc.groups):
                group_input = inp[:, :, :, g * input_group_size : (g + 1) * input_group_size]
                group_kernel = filters[:, :, :, g * output_group_size : (g + 1) * output_group_size]

                if zp_kernel is not None:
                    group_kernel = group_kernel - zp_kernel

                if l_desc.op == LayerType.deconv and self.target != EmulationInferenceTargets.SDK_PARTIAL_NUMERIC:
                    # Transposed convolution; the batch size is taken dynamically from the input.
                    deconv_output_shape = tf.stack(
                        [tf.shape(input=inp)[0], l_desc.output_shape[1], l_desc.output_shape[2], output_group_size],
                    )
                    # The last two kernel axes are swapped before calling conv2d_transpose.
                    deconv_reshaped_filters = tf.transpose(a=group_kernel, perm=[0, 1, 3, 2])
                    conv_op = tf.nn.conv2d_transpose(
                        group_input,
                        deconv_reshaped_filters,
                        deconv_output_shape,
                        strides=strides,
                        padding="SAME",
                        name="op",
                    )
                else:
                    # Grouped conv, and grouped deconv in partial numeric, use a regular convolution.
                    conv_op = tf.nn.conv2d(
                        input=group_input,
                        filters=group_kernel,
                        strides=strides,
                        padding="VALID",
                        dilations=l_desc.dilations,
                        name="op",
                    )
                concat_inputs.append(conv_op)
            return tf.concat(concat_inputs, 3, name="op")

        # Non grouped layers: the zero point (if any) is subtracted from the whole kernel.
        if zp_kernel is not None:
            filters = filters - zp_kernel

        if l_desc.op == LayerType.conv:
            return tf.nn.conv2d(
                input=inp,
                filters=filters,
                strides=strides,
                padding="VALID",
                dilations=l_desc.dilations,
                name="op",
            )
        elif l_desc.op == LayerType.reduce_sum:
            # reduce_sum is built as a convolution that always uses unit strides.
            return tf.nn.conv2d(input=inp, filters=filters, strides=[1, 1, 1, 1], padding="VALID", name="op")
        elif l_desc.op == LayerType.dw:
            rate_h, rate_w = l_desc.dilations[1:3]
            # In the TensorFlow 1.7 depthwise_conv2d API there is no dilation param, only a rate param
            # (in 2.0 there is a dilation param).
            # rate > 1 together with strides > 1 is not supported.
            if (rate_h > 1 or rate_w > 1) and (strides[1] > 1 or strides[2] > 1):
                raise BackendEmulatorException(
                    f"native depthwise does not support dilation {l_desc.dilations[1:3]} and strides {strides}",
                )
            if l_desc.dynamic_weights:
                # The weights come from the second input. Batch and channels are merged into one
                # channels axis for both the data and the weights, and the result is reshaped back
                # to the layer output shape.
                # NOTE(review): the data reshape is a plain reshape with no transpose, which looks
                # valid only for batch size 1 - confirm against the callers.
                batch = tf.shape(inp)[0]
                weights_input = inputs[1]
                channels = tf.shape(weights_input)[-1]
                inp_reshaped = tf.reshape(
                    inp,
                    (1, tf.shape(inp)[1], tf.shape(inp)[2], batch * channels),
                    name="data_input_reshape",
                )
                filters = tf.reshape(
                    filters,
                    (tf.shape(weights_input)[1], tf.shape(weights_input)[2], batch * channels, 1),
                    name="weight_input_reshape",
                )
                result = tf.nn.depthwise_conv2d(
                    input=inp_reshaped,
                    filter=filters,
                    strides=strides,
                    padding="VALID",
                    dilations=(rate_h, rate_w),
                    name="op",
                )
                return tf.reshape(
                    result,
                    (-1, l_desc.output_shape[1], l_desc.output_shape[2], l_desc.output_shape[3]),
                    name="op_reshape",
                )
            # tf does not support depthwise with different strides on height and width, group conv is a workaround
            elif strides[1] != strides[2]:
                # One single channel convolution per input channel, concatenated on the channels axis.
                # The dilation rates are not passed here; a rate > 1 cannot reach this branch because
                # unequal strides mean that at least one stride is > 1, which raises above.
                groups = filters.shape[-2]
                filters = tf.transpose(filters, (0, 1, 3, 2))
                all_groups = []
                for g in range(groups):
                    group_input = inp[:, :, :, g : g + 1]
                    group_kernel = filters[:, :, :, g : g + 1]
                    conv_group = tf.keras.backend.conv2d(group_input, group_kernel, strides=strides, padding="valid")
                    all_groups.append(conv_group)
                return tf.concat(all_groups, axis=-1)
            else:
                return tf.nn.depthwise_conv2d(
                    input=inp,
                    filter=filters,
                    strides=strides,
                    padding="VALID",
                    dilations=(rate_h, rate_w),
                    name="op",
                )

        elif l_desc.op == LayerType.deconv:
            if self.target == EmulationInferenceTargets.SDK_PARTIAL_NUMERIC:
                # In partial numeric the deconv is built as a regular convolution.
                # NOTE(review): presumably the caller already rearranged the kernel and strides for
                # this - confirm.
                return tf.nn.conv2d(input=inp, filters=filters, strides=strides, padding="VALID", name="op")
            else:
                # Transposed convolution; the batch size is taken dynamically from the input.
                batch_size = tf.shape(input=inp)[0]
                deconv_output_shape = tf.stack(
                    [batch_size, l_desc.output_shape[1], l_desc.output_shape[2], l_desc.output_shape[3]],
                )
                # The last two kernel axes are swapped before calling conv2d_transpose.
                deconv_reshaped_filters = tf.transpose(a=filters, perm=[0, 1, 3, 2])
                return tf.nn.conv2d_transpose(
                    inp,
                    deconv_reshaped_filters,
                    deconv_output_shape,
                    strides,
                    "SAME",
                    name="op",
                )
        elif l_desc.op == LayerType.avgpool:
            if self.target in [EmulationInferenceTargets.SDK_PARTIAL_NUMERIC]:
                # In partial numeric the avgpool is built as a depthwise convolution: axis 2 of the
                # kernel is squeezed out and a new axis is appended at position 3.
                kernel = tf.expand_dims(tf.squeeze(filters, 2), 3)
                # tf.compat.v1.nn.depthwise_conv2d_native does not support non-equal strides.
                # In case the layer is global avgpool and the strides are not equal we can just change the strides to
                # [1, 1, 1, 1], as the strides do not have any effect on the results when the kernel is the same
                # size as the input and the padding is VALID
                if l_desc.is_global_avg_pool():
                    strides = [1, 1, 1, 1]
                elif l_desc.kernel_shape[2] == l_desc.input_shape[2]:
                    # Index 2 of the kernel shape equals index 2 of the input shape (presumably the
                    # width - confirm), so strides[1] is reused for both spatial axes.
                    strides = [1, strides[1], strides[1], 1]
                return tf.compat.v1.nn.depthwise_conv2d_native(inp, kernel, strides, "VALID", name="op")
            else:
                # Native avgpool: SAME padding only for a tiled avgpool declared with same padding,
                # VALID padding otherwise.
                if l_desc.is_tiled_avg_pool() and l_desc.padding == PaddingType.same:
                    return tf.nn.avg_pool2d(
                        input=inp,
                        ksize=l_desc.kernel_shape,
                        strides=strides,
                        padding="SAME",
                        name="op",
                    )
                return tf.nn.avg_pool2d(
                    input=inp,
                    ksize=l_desc.kernel_shape,
                    strides=strides,
                    padding="VALID",
                    name="op",
                )
        else:
            raise BackendNotImplementedError("Unknown convolutional layer type")

    def _build_conv(
        self,
        l_desc,
        inputs,
        inter_layer_precision_mode,
        weight_op_idx=None,
        elementwise=None,
        k_vars=None,
        pad_val=None,
    ):
        """
        Adds Convolutional operation to the graph.
        """
        # For h_conv2d, if elementwise size is different than output size it doesn't perform elementwise add
        # In this case we just give a dummy sized 1 tensor
        if len(inputs[0].shape) == 2:
            assert l_desc.op == LayerType.dw, "2D input supported for depthwise layer"
            inp_shape = inputs[0].shape
            inp = tf.reshape(inputs[0], [-1, 1, 1, inp_shape[1]])
        else:
            inp = inputs[0]

        is_elemetwise = True
        is_16bit_precision_mode = self._is_16bit_input(l_desc)
        if elementwise is None:
            elementwise = tf.zeros([1], inp.dtype)
            is_elemetwise = False

        accumulator_size = self._get_accumulator_size(inter_layer_precision_mode, is_16bit_precision_mode)
        k = None
        strides = self._get_strides(l_desc)
        kernel_shape = l_desc.kernel_shape

        if l_desc.op in [LayerType.conv]:
            kernel_shape = [
                l_desc.kernel_shape[0],
                l_desc.kernel_shape[1],
                int(l_desc.kernel_shape[2] / sum(l_desc.group_sizes) * max(l_desc.group_sizes)),
                l_desc.kernel_shape[3],
            ]
        if l_desc.op == LayerType.avgpool:
            if self.target in [
                EmulationInferenceTargets.SDK_NUMERIC,
                EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
                EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,
            ]:
                kernel_shape = [l_desc.kernel_shape[1], l_desc.kernel_shape[2], 1, l_desc.input_shape[3]]
            else:
                kernel_shape = [l_desc.kernel_shape[1], l_desc.kernel_shape[2], 1, 1]
        if l_desc.op in (LayerType.reduce_sum, LayerType.reduce_mean):
            kernel_shape = l_desc.kernel_shape
        elif l_desc.op == LayerType.deconv:
            use_super_deconv = is_super_deconv(l_desc)
            if self.target in [
                EmulationInferenceTargets.SDK_NUMERIC,
                EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
                EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,
            ]:
                strides = [1, 1, 1, 1]

                if use_super_deconv:
                    kernel_shape = [
                        int(np.ceil(l_desc.kernel_shape[0] / l_desc.strides[1])),
                        int(np.ceil(l_desc.kernel_shape[1] / l_desc.strides[2])),
                        l_desc.kernel_shape[2] // l_desc.groups,
                        l_desc.kernel_shape[3] * l_desc.strides[1] * l_desc.strides[2],
                    ]
                else:
                    new_kernel_h = int(np.ceil(l_desc.kernel_shape[0] / l_desc.strides[1]))
                    new_kernel_w = int(np.ceil(l_desc.kernel_shape[1] / l_desc.strides[2]))

                    kernel_shape = [
                        new_kernel_h,
                        new_kernel_w,
                        int(l_desc.kernel_shape[2] / l_desc.groups),
                        l_desc.kernel_shape[3] * (l_desc.strides[1] * l_desc.strides[2]),
                    ]
            else:
                kernel_shape = [
                    l_desc.kernel_shape[0],
                    l_desc.kernel_shape[1],
                    int(l_desc.kernel_shape[2] / l_desc.groups),
                    l_desc.kernel_shape[3],
                ]
        use_split_weights = (
            self.is_pluto_arch
            and self._is_16bit_weights(l_desc)
            and self.target
            in [EmulationInferenceTargets.SDK_NUMERIC, EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC]
        )
        if use_split_weights:
            kernel_shape = l_desc.kernel_shape + [2]
        if l_desc.op == LayerType.deconv and self.target in [
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
            EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,
        ]:
            if use_super_deconv:
                k, k_var = self._build_kernel(
                    tf.Variable(tf.random.normal(kernel_shape, stddev=0.35), name="conv_kernel"),
                    l_desc,
                )
            else:
                deconv_k_shape = [
                    l_desc.kernel_shape[0],
                    l_desc.kernel_shape[1],
                    int(l_desc.kernel_shape[2] / l_desc.groups),
                    l_desc.kernel_shape[3],
                ]
                k, k_var = self._build_kernel(
                    tf.Variable(tf.random.normal(deconv_k_shape, stddev=0.35), name="kernel"),
                    l_desc,
                )
                if not ((deconv_k_shape[0] == 2) or (deconv_k_shape[0] == 4 and l_desc.strides[1] == 4)):
                    k = k[::-1, ::-1]
                k = self._build_deconv_split_channels(l_desc, k)
            self._kernel_variables[l_desc.name] = k_var

        elif (
            l_desc.op == LayerType.dw
            and not l_desc.dynamic_weights
            and self.target
            in [EmulationInferenceTargets.SDK_NUMERIC, EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC]
        ):
            k, k_var = self._build_kernel(
                tf.Variable(tf.random.normal(kernel_shape, stddev=0.35), name="kernel"),
                l_desc,
            )
            self._kernel_variables[l_desc.name] = k_var
            k = tf.transpose(a=k, perm=[0, 1, 3, 2])
        elif l_desc.op in [LayerType.avgpool, LayerType.reduce_sum, LayerType.reduce_mean]:
            # avgpool should not create a weights clipping op because the weights are const
            k = tf.Variable(tf.random.normal(kernel_shape, stddev=0.35), name="kernel")
        else:
            if l_desc.dynamic_weights:
                k = inputs[1]
                k_var = inputs[1]
            elif l_desc.input_disparity > 1 or kernel_shape[2] // l_desc.input_features > 1:
                k, k_var = k_vars
            else:
                kernel_var = self._get_variable_with_reuse(
                    "kernel", initial_value=tf.random.normal(kernel_shape, stddev=0.35), name="kernel"
                )
                k, k_var = self._build_kernel(
                    kernel_var,
                    l_desc,
                )
                if use_split_weights:
                    k = k[..., weight_op_idx]
                    k_var = k_var[..., weight_op_idx]
                    kernel_shape = k.shape
            self._kernel_variables[l_desc.name] = k_var

        if self.target in [EmulationInferenceTargets.SDK_FINE_TUNE] and l_desc.op != LayerType.avgpool:
            with tf.compat.v1.variable_scope("fine_tune_weights"):
                kernel_delta = tf.Variable(tf.zeros(tf.shape(k)), dtype=tf.float32, name="kernel_delta")
            self._kernel_delta_variables[l_desc.name] = kernel_delta
            if self.fine_tune_params.should_quantize_weights and l_desc.name not in self.mixed_native_layers:
                k = self._build_fine_tune_weights(k, l_desc, inter_layer_precision_mode)

        input_shape = (
            l_desc.input_shape if len(l_desc.input_shape) != 2 else [l_desc.input_shape[0], 1, 1, l_desc.input_shape[1]]
        )
        (pad_beg_h, pad_end_h, pad_beg_w, pad_end_w) = calculate_padding(
            l_desc.padding,
            kernel_shape[0],
            kernel_shape[1],
            strides[1],
            strides[2],
            input_shape[1],
            input_shape[2],
            self._get_dilation(l_desc),
        )

        # if new quant, we need to pad with non-zero as there is a different zero point for the quantized data
        if self.target in [
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
            EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,
        ]:
            if not (
                l_desc.op in [LayerType.conv]
                and (l_desc.input_disparity > 1 or kernel_shape[2] // l_desc.input_features > 1)
            ):
                zp_in = tf.reshape(
                    tf.Variable(
                        tf.random.normal([l_desc.output_features], stddev=0.35),
                        name="zero_point_in",
                        shape=tf.TensorShape(None),
                    ),
                    [-1],
                )[0]

                pad_val = self._get_const("padding_const_value", zp_in)
            # ...TF lacks padding with non-zero; implementing by wrapping zero-pad with subtract and add back..
            if self._is_int32_numeric(l_desc, is_16bit_precision_mode):
                pad_val = tf.cast(pad_val, tf.int32)
                inp = tf.cast(inp, tf.int32)
                if elementwise is not None:
                    elementwise = tf.cast(elementwise, tf.int32)
            inp = tf.add(inp, -pad_val)
            inp_p = tf.pad(
                tensor=inp,
                paddings=[[0, 0], [pad_beg_h, pad_end_h], [pad_beg_w, pad_end_w], [0, 0]],
                mode="CONSTANT",
            )
            inp_p = tf.add(inp_p, pad_val)
        elif l_desc.op in [LayerType.deconv]:
            inp_p = inp
        else:
            inp_p = tf.pad(
                tensor=inp,
                paddings=[[0, 0], [pad_beg_h, pad_end_h], [pad_beg_w, pad_end_w], [0, 0]],
                mode="CONSTANT",
            )

        if self.target in [EmulationInferenceTargets.SDK_NUMERIC, EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC]:
            # For h_conv2d, if elementwise size is different than output size it doesn't perform elementwise add
            # In this case we just give a dummy sized 1 tensor.
            if self._is_int32_numeric(l_desc, is_16bit_precision_mode):
                k = tf.cast(k, tf.int32)
            if elementwise is None:
                if self._is_int32_numeric(l_desc, is_16bit_precision_mode):
                    elementwise = tf.zeros([1], tf.int32)
                else:
                    elementwise = tf.zeros([1], tf.float32)
            if (
                l_desc.op == LayerType.avgpool
                or l_desc.op == LayerType.dw
                or (l_desc.op in [LayerType.reduce_sum, LayerType.reduce_mean] and l_desc.is_reduce_height_or_width())
            ):
                if l_desc.dynamic_weights:
                    k = tf.reshape(k, (tf.shape(k)[0], tf.shape(k)[1], tf.shape(k)[2], 1, tf.shape(k)[3]))

                if l_desc.op == LayerType.dw:
                    op = self._build_numeric_conv(
                        l_desc,
                        inp_p,
                        k,
                        kernel_shape,
                        elementwise,
                        accumulator_size,
                        strides,
                        self._get_dilation(l_desc),
                        self.hsim.h_depth_wise,
                    )
                else:
                    op = self.hsim.h_depth_wise(
                        inp_p,
                        k,
                        elementwise,
                        self.emulation_mult_shift,
                        strides,
                        self._get_dilation(l_desc),
                        "VALID",
                        accumulator_size=accumulator_size,
                        use_fp16_acc=False,
                        name="op",
                    )

            else:
                dilation = self._get_dilation(l_desc)
                hsim_op = self.hsim.h_conv2d_dilation if dilation[1] > 1 or dilation[2] > 1 else self.hsim.h_conv2d

                if l_desc.groups > 1:
                    op = self._build_numeric_group_conv(
                        l_desc,
                        inp_p,
                        k,
                        is_elemetwise,
                        elementwise,
                        accumulator_size,
                        strides,
                        dilation,
                        hsim_op,
                    )
                else:
                    op = self._build_numeric_conv(
                        l_desc,
                        inp_p,
                        k,
                        kernel_shape,
                        elementwise,
                        accumulator_size,
                        strides,
                        dilation,
                        hsim_op,
                    )
        else:
            inp_vector = [inp_p]
            if len(inputs) > 1:
                inp_vector.append(inputs[1])
            op = self._get_conv_op_tf(l_desc, inp_vector, k, strides)

        return op, k

    def _build_conv_decompose(self, l_desc, inputs, inter_layer_precision_mode, elementwise=None):
        """
        Builds a convolution decomposed into four sub-convolutions ("ll", "hl", "lh", "hh").

        The first input is split into its low byte and the following byte. Each sub-convolution is built from a
        deep copy of ``l_desc`` inside its own ``conv_<suffix>`` variable scope, followed by a bias add. The four
        results are concatenated on axis 3, passed through an output stage, and then recombined inside the
        ``ew_add`` scope using two standalone elementwise adds, a bias add and a final output stage.

        Args:
            l_desc: Layer description. It is deep-copied for the sub-convolutions and used directly for the
                final bias and output stage.
            inputs: List of input tensors; only ``inputs[0]`` is read here.
            inter_layer_precision_mode: Precision mode forwarded to the elementwise adds, the final bias and
                the final output stage.
            elementwise: Not referenced in this method.

        Returns:
            The first element returned by ``_build_output_stage`` for the recombined result.

        NOTE(review): ``convs_post_act`` is only assigned when ``self.target`` is ``SDK_NUMERIC`` or
        ``SDK_PARTIAL_NUMERIC``; any other target would reach ``tf.split(convs_post_act, ...)`` with the name
        unbound. Presumably this method is only called for those targets - confirm against the callers.
        """
        # Bits 0-7 and bits 8-15 of the input, each masked to a single byte.
        low_input = tf.bitwise.bitwise_and(inputs[0], 0xFF)
        high_input = tf.bitwise.bitwise_and(tf.bitwise.right_shift(inputs[0], 8), 0xFF)

        conv_results = {}
        sub_conv_scope = {}
        # The first letter of the suffix selects the input half ("l" -> low byte, otherwise high byte).
        # NOTE(review): the second letter presumably selects the weights half; that selection is not visible here.
        for suf in ["ll", "hl", "lh", "hh"]:
            with tf.compat.v1.variable_scope(f"conv_{suf}") as sub_conv_scope[suf]:
                # Temporarily point current_scope at the sub-conv scope; restored at the end of the iteration.
                orig_scope = self.current_scope
                self.current_scope = sub_conv_scope[suf]

                # Work on a copy so that the precision override does not leak into the caller's l_desc.
                sub_conv = copy.deepcopy(l_desc)
                sub_conv.precision_config.precision_mode = PrecisionMode.a8_w8_a16
                sub_convs_inter_layer_precision_mode = InterLayerPrecisionMode.from_precision_mode(
                    PrecisionMode.a8_w8,
                    PrecisionMode.a8_w8,
                )
                # A fresh mult_shift variable is created per sub-conv scope.
                self.emulation_shift = tf.cast(
                    tf.Variable(0, dtype=tf.int8, trainable=False, name="output_stage/mult_shift"),
                    dtype=tf.float32,
                )
                self._set_emulation_mult_shift(l_desc, False)
                inp = low_input if suf[0] == "l" else high_input
                op, _ = self._build_conv(
                    sub_conv,
                    [tf.cast(inp, dtype=tf.float32)],
                    sub_convs_inter_layer_precision_mode,
                )
                b_op = self._build_bias(sub_conv, op, sub_convs_inter_layer_precision_mode)
                conv_results[suf] = b_op

                self.current_scope = orig_scope
        # Stack the four partial results along axis 3 in the order ll, hl, lh, hh.
        all_convs_op = tf.concat(
            [conv_results["ll"], conv_results["hl"], conv_results["lh"], conv_results["hh"]],
            axis=3,
        )

        # The shared output stage of the four sub-convs is built under the name scope of "conv_ll".
        with tf.name_scope(sub_conv_scope["ll"].original_name_scope):
            all_convs = copy.deepcopy(l_desc)
            all_convs.precision_config.precision_mode = PrecisionMode.a8_w8_a16
            # One quantization group per concatenated sub-conv result.
            all_convs.precision_config.quantization_groups = 4
            # Reuse the activation points registered under "<layer name>/conv_ll" for the layer itself.
            self.activation_points[l_desc.name] = self.activation_points[l_desc.name + "/conv_ll"]
            # Reset the local constants of the current scope before registering size_splits.
            self.local_consts[self.current_scope.name] = {}
            self.local_consts[self.current_scope.name]["size_splits"] = tf.Variable(
                tf.ones([4], dtype=tf.int32),
                dtype=tf.int32,
                trainable=False,
                name="output_stage/piecewise/size_splits",
            )
            sub_convs_to_ew_add_layer_precision_mode = InterLayerPrecisionMode.from_precision_mode(
                PrecisionMode.a8_w8,
                PrecisionMode.a16_w16,
            )
            # NOTE(review): convs_post_act stays unbound for every other target (see the docstring).
            if self.target in [EmulationInferenceTargets.SDK_NUMERIC, EmulationInferenceTargets.SDK_PARTIAL_NUMERIC]:
                convs_post_act, _, _ = self._build_output_stage(
                    all_convs_op,
                    all_convs,
                    sub_convs_to_ew_add_layer_precision_mode,
                )

        with tf.compat.v1.variable_scope("ew_add", reuse=tf.compat.v1.AUTO_REUSE):
            # Imported locally rather than at the top of the file (presumably to avoid a circular import).
            from hailo_sdk_common.hailo_nn.hn_layers import EWAddLayer

            ew_add_desc = EWAddLayer.from_layer(l_desc)
            ew_add_desc.precision_config.bias_mode = BiasMode.double_scale_initialization
            self.emulation_shift = tf.cast(
                tf.Variable(0, dtype=tf.int8, trainable=False, name="output_stage/mult_shift"),
                dtype=tf.float32,
            )
            self._set_emulation_mult_shift(l_desc, True)
            # Two standalone adds over the four split results, called with index lists [0, 2] and [1, 3]
            # (i.e. the "ll"/"lh" and the "hl"/"hh" positions of the concatenation above).
            ew_add_l = self._build_standalone_ew_add_sub(
                ew_add_desc,
                tf.split(convs_post_act, 4, axis=3),
                inter_layer_precision_mode,
                LayerType.ew_add,
                [0, 2],
            )
            ew_add_h = self._build_standalone_ew_add_sub(
                ew_add_desc,
                tf.split(convs_post_act, 4, axis=3),
                inter_layer_precision_mode,
                LayerType.ew_add,
                [1, 3],
            )
            ew_add_total = tf.add(ew_add_l, ew_add_h)
            # The bias of the original layer is applied once, on the recombined sum.
            b_add_op = self._build_bias(l_desc, ew_add_total, inter_layer_precision_mode)
            # The final output stage uses a single split, replacing the 4-way size_splits set above.
            self.local_consts[self.current_scope.name]["size_splits"] = tf.Variable(
                tf.ones([1], dtype=tf.int32),
                dtype=tf.int32,
                trainable=False,
                name="output_stage/piecewise/size_splits",
            )
            ew_add_post_act, _, _ = self._build_output_stage(b_add_op, l_desc, inter_layer_precision_mode)

        return ew_add_post_act

    def _build_numeric_group_conv(
        self,
        l_desc,
        inp_p,
        k,
        is_elemetwise,
        elementwise,
        accumulator_size,
        strides,
        dilation,
        hsim_op,
    ):
        """
        Builds a numeric group convolution as one ``hsim_op`` call per group, concatenated on the channel axis.

        The input channels (``l_desc.input_shape[3]``) and the kernel output channels (``k.shape[3]``) are divided
        between the groups proportionally to ``l_desc.group_sizes``. When ``self._is_zp_required`` reports that a
        zero-point term is needed, every group gets an additional ``hsim_op`` call with a constant kernel of
        ``-zp_kernel``, whose result is tiled over the group's output channels and added to the group output.

        Args:
            l_desc: Layer description (``groups``, ``group_sizes``, ``input_shape`` and ``name`` are read).
            inp_p: Input tensor, sliced per group on its last axis.
            k: Kernel tensor, sliced per group on its two last axes.
            is_elemetwise: When truthy, ``elementwise`` is sliced per group; otherwise it is passed as is.
            elementwise: Elementwise operand forwarded to ``hsim_op``.
            accumulator_size: Forwarded to the hsim ops.
            strides: Forwarded to ``hsim_op``.
            dilation: Forwarded to ``hsim_op``.
            hsim_op: Callable that builds the convolution op of a single group.

        Returns:
            The concatenation (axis 3, named "op") of the per-group outputs.
        """
        is_zp_required = self._is_zp_required(self._is_16bit_input(l_desc))

        total_group_size = sum(l_desc.group_sizes)
        in_sizes = [int(l_desc.input_shape[3] * size / total_group_size) for size in l_desc.group_sizes]
        out_sizes = [int(int(k.shape[3]) * size / total_group_size) for size in l_desc.group_sizes]

        if is_zp_required:
            zp_kernel = tf.Variable(initial_value=0, dtype=tf.int32, trainable=False, name="zp_kernel")
            self._add_variables_to_initialize(l_desc.name, [zp_kernel])
            # The spatial kernel dimensions are the same for every group.
            kernel_h, kernel_w, _, _ = k.shape

        group_outputs = []
        in_start = 0
        out_start = 0
        for group_idx in range(l_desc.groups):
            in_size = in_sizes[group_idx]
            out_size = out_sizes[group_idx]
            in_end = in_start + in_size
            out_end = out_start + out_size

            group_input = inp_p[:, :, :, in_start:in_end]
            # The kernel's input-channel axis is always sliced from 0, only its output channels are offset.
            group_kernel = k[:, :, :in_size, out_start:out_end]
            if is_elemetwise:
                group_elementwise = elementwise[:, :, :, out_start:out_end]
            else:
                group_elementwise = elementwise
            in_start, out_start = in_end, out_end

            group_op = hsim_op(
                group_input,
                group_kernel,
                group_elementwise,
                self.emulation_mult_shift,
                strides,
                dilation,
                "VALID",
                accumulator_size=accumulator_size,
                use_fp16_acc=False,
                name="op",
            )
            if is_zp_required:
                # Convolve the group input with a constant (-zp_kernel) kernel that has a single output channel,
                # then replicate the result over all of the group's output channels.
                zp_group_kernel = tf.ones([kernel_h, kernel_w, in_size, 1], dtype=tf.int32)
                zp_group_kernel = tf.scalar_mul(-zp_kernel, zp_group_kernel)
                zp_term = hsim_op(
                    group_input,
                    tf.cast(zp_group_kernel, tf.int32),
                    tf.zeros([1], tf.int32),
                    self.emulation_mult_shift,
                    strides,
                    dilation,
                    "VALID",
                    accumulator_size=accumulator_size,
                    use_fp16_acc=False,
                    name="op_zp",
                )
                zp_term = tf.tile(zp_term, [1, 1, 1, out_size])
                group_op = self.hsim.h_add(
                    group_op,
                    zp_term,
                    1,
                    1,
                    self.emulation_mult_shift,
                    accumulator_size=accumulator_size,
                    use_fp16_acc=False,
                    name="op_after_zp",
                )

            group_outputs.append(group_op)
        return tf.concat(group_outputs, 3, name="op")

    def _get_zp_kernel_shape(self, l_desc, k):
        """
        Returns an int32 tensor of ones used as the base of the zero-point kernel.

        For dw, normalization, activation, proposal_generator and batch_norm layers the tensor has shape
        ``[k_h, k_w, 1, f_out]``; for any other layer type it has shape ``[k_h, k_w, f_in, 1]``, where the
        dimensions are taken from ``k.shape``.
        """
        kernel_h, kernel_w, in_features, out_features = k.shape
        per_output_channel_ops = (
            LayerType.dw,
            LayerType.normalization,
            LayerType.activation,
            LayerType.proposal_generator,
            LayerType.batch_norm,
        )
        if l_desc.op in per_output_channel_ops:
            ones_shape = [kernel_h, kernel_w, 1, out_features]
        else:
            ones_shape = [kernel_h, kernel_w, in_features, 1]
        return tf.ones(ones_shape, dtype=tf.int32)

    def _build_numeric_conv(
        self,
        l_desc,
        inp_p,
        k,
        kernel_shape,
        elementwise,
        accumulator_size,
        strides,
        dilation,
        hsim_op,
        name="op",
    ):
        """
        Builds a single numeric convolution op, adding a zero-point term when required.

        The convolution itself is one ``hsim_op`` call with "VALID" padding. When the layer input is 16 bit
        (according to ``self._is_16bit_input``) and the architecture is not pluto, a second ``hsim_op`` call is
        built with a constant kernel of ``-zp_kernel`` and its result is added to the convolution output. For conv
        and deconv layers that result is first tiled ``kernel_shape[3]`` times on the last axis.

        Args:
            l_desc: Layer description.
            inp_p: Input tensor of the convolution.
            k: Kernel tensor.
            kernel_shape: Kernel shape; only index 3 is read, for the tiling of the zero-point term.
            elementwise: Elementwise operand forwarded to ``hsim_op``.
            accumulator_size: Forwarded to the hsim ops.
            strides: Forwarded to ``hsim_op``.
            dilation: Forwarded to ``hsim_op``.
            hsim_op: Callable that builds the convolution op.
            name: Name of the main convolution op.

        Returns:
            The convolution op, or the sum of the convolution and the zero-point term.
        """
        is_16bit_input = self._is_16bit_input(l_desc)
        acc_kwargs = {"accumulator_size": accumulator_size, "use_fp16_acc": False}

        conv_op = hsim_op(
            inp_p,
            k,
            elementwise,
            self.emulation_mult_shift,
            strides,
            dilation,
            "VALID",
            name=name,
            **acc_kwargs,
        )
        if not is_16bit_input or self.is_pluto_arch:
            return conv_op

        zp_kernel = tf.Variable(initial_value=0, dtype=tf.int32, trainable=False, name="zp_kernel")
        self._add_variables_to_initialize(l_desc.name, [zp_kernel])
        zp_conv_kernel = tf.scalar_mul(-zp_kernel, self._get_zp_kernel_shape(l_desc, k))
        zp_term = hsim_op(
            inp_p,
            tf.cast(zp_conv_kernel, tf.int32),
            tf.zeros([1], tf.int32),
            self.emulation_mult_shift,
            strides,
            dilation,
            "VALID",
            name="op_zp",
            **acc_kwargs,
        )
        if l_desc.op in (LayerType.deconv, LayerType.conv):
            # Replicate the zero-point term kernel_shape[3] times on the last axis.
            zp_term = tf.tile(zp_term, [1, 1, 1, kernel_shape[3]])
        return self.hsim.h_add(
            conv_op,
            zp_term,
            1,
            1,
            self.emulation_mult_shift,
            name="op_after_zp",
            **acc_kwargs,
        )

    def _build_deconv_split_channels(self, l_desc, k):
        """
        Rearranges a deconv kernel into strided sub-kernels stacked on the output-features axis.

        For every ``(offset_h, offset_w)`` pair returned by ``get_deconv_stack_order`` a sub-kernel is taken by
        slicing ``k`` from that offset with the layer strides. The sub-kernels are stacked on a new last axis and
        reshaped to ``[h, w, k.shape[2], k.shape[3] * strides[1] ** 2]``.

        NOTE(review): the output feature count uses ``strides[1] ** 2``, so this presumably assumes equal
        height and width strides - confirm against ``get_deconv_stack_order``.
        """
        stride_h = l_desc.strides[1]
        stride_w = l_desc.strides[2]
        stack_order = get_deconv_stack_order(l_desc.kernel_height, stride_h)

        sub_kernels = [k[offset_h::stride_h, offset_w::stride_w] for offset_h, offset_w in stack_order]

        stacked = tf.stack(sub_kernels, axis=4)
        squeezed = tf.squeeze(stacked)
        target_shape = [
            stacked.shape[0],
            stacked.shape[1],
            int(k.shape[2]),
            int(k.shape[3]) * (stride_h**2),
        ]
        return tf.reshape(squeezed, target_shape)

    def _build_super_deconv_split_kernels(self, l_desc, k):
        """
        Splits a deconv kernel into ``stride_h * stride_w`` sub-kernels merged on the output-features axis.

        The kernel is zero padded at its top/left, sliced with the layer strides into one sub-kernel per
        (row, col) offset, and every sub-kernel is zero padded at its bottom/right up to
        ``ceil(kernel / stride)`` in each spatial dimension. The grid of sub-kernels is then rolled, flattened
        in row-major order and reshaped to
        ``[new_kernel_h, new_kernel_w, input_features, output_features * stride_h * stride_w]``.

        Args:
            l_desc: Layer description (``strides``, ``kernel_shape`` and ``groups`` are read).
            k: Kernel tensor to split.

        Returns:
            The reshaped tensor holding all of the sub-kernels.
        """

        def leading_pad(kernel_dim, stride):
            # Amount of zero padding added before the kernel in one spatial dimension.
            remainder = kernel_dim % stride
            if remainder == 0 or kernel_dim < stride:
                return 0
            if (kernel_dim // stride) % 2 == 0:
                return (stride - remainder) // 2
            return int(np.ceil(stride / 2)) - int(np.ceil(remainder / 2))

        def roll_amount(kernel_dim, stride, pad_before):
            # Shift applied to the grid of sub-kernels in one spatial dimension (see the explanation below).
            if kernel_dim < stride:
                base_roll = kernel_dim - 1
            else:
                base_roll = np.ceil((kernel_dim + stride - 2) / 2)
            return int(base_roll + pad_before)

        stride_h = l_desc.strides[1]
        stride_w = l_desc.strides[2]

        kernel_h = l_desc.kernel_shape[0]
        kernel_w = l_desc.kernel_shape[1]
        input_features = l_desc.kernel_shape[2] // l_desc.groups
        output_features = l_desc.kernel_shape[3]

        new_kernel_h = int(np.ceil(kernel_h / stride_h))
        new_kernel_w = int(np.ceil(kernel_w / stride_w))

        pad_kernel_before_h = leading_pad(kernel_h, stride_h)
        pad_kernel_before_w = leading_pad(kernel_w, stride_w)

        padded_kernel = tf.pad(k, ([pad_kernel_before_h, 0], [pad_kernel_before_w, 0], [0, 0], [0, 0]))

        # Grid of sub-kernels: one list per row offset, holding one sub-kernel per column offset.
        sub_kernel_grid = []
        for row in range(stride_h):
            offset_h = (stride_h - row) if row else 0
            grid_row = []
            for col in range(stride_w):
                offset_w = (stride_w - col) if col else 0

                sub_kernel = padded_kernel[offset_h::stride_h, offset_w::stride_w]

                # Pad at the end so that all of the sub-kernels share the same spatial size.
                missing_h = new_kernel_h - np.size(sub_kernel, 0)
                missing_w = new_kernel_w - np.size(sub_kernel, 1)
                grid_row.append(
                    tf.pad(sub_kernel, ([0, missing_h], [0, missing_w], [0, 0], [0, 0])),
                )
            sub_kernel_grid.append(grid_row)

        # Reordering the sub-kernels:
        # To find the correct order, imagine the full kernel "running" over the input "dilated" by the deconv
        # stride. The sub-kernel that contributes to the first output pixel is not necessarily the first one
        # generated above; it depends on the padding. Every padding row/col is equivalent to rolling the grid of
        # sub-kernels by one in the matching dimension. The padding is given by:
        #       kernel + output_size - 1 = pad + (input_size - 1) * stride + 1
        #       output_size = input_size * stride
        #                   ||
        #                   \/
        #       pad = kernel + stride - 2
        # When kernel >= stride, the roll is the top-left padding of SAME padding.
        # When kernel < stride, the roll is (kernel - 1).
        # On top of that, pad_kernel_before adds an extra roll.
        roll_h = roll_amount(kernel_h, stride_h, pad_kernel_before_h)
        roll_w = roll_amount(kernel_w, stride_w, pad_kernel_before_w)
        rolled_grid = tf.roll(sub_kernel_grid, shift=[roll_h, roll_w], axis=[0, 1])

        flat_sub_kernels = [rolled_grid[row][col] for row in range(stride_h) for col in range(stride_w)]

        stacked = tf.stack(flat_sub_kernels, axis=3)
        # Swap the sub-kernels axis with the output-features axis.
        permuted = tf.transpose(a=stacked, perm=[0, 1, 2, 4, 3])
        squeezed = tf.squeeze(permuted)
        return tf.reshape(
            squeezed,
            [new_kernel_h, new_kernel_w, input_features, output_features * stride_h * stride_w],
        )

    def _get_bias_shape(self, l_desc):
        """
        Returns the shape of the bias of the given layer.

        The result depends on the layer type:
        * ew_add / ew_sub / ew_mult / feature_multiplier - the last output dimension.
        * batch_norm / avgpool / normalization / activation - input dimension 3 for a 4-dimensional input
          shape, otherwise input dimension 1.
        * dw - ``kernel_shape[2] * kernel_shape[3]``.
        * deconv on the numeric targets - ``kernel_shape[3] * strides[1] * strides[2]``.
        * anything else - ``[kernel_shape[-1], 2]`` when the "precision_split_zp" const is truthy, otherwise
          ``kernel_shape[-1:]``.

        Note that the first four cases return a scalar, while the last one returns a list.
        """
        op = l_desc.op

        if op in (LayerType.ew_add, LayerType.ew_sub, LayerType.ew_mult, LayerType.feature_multiplier):
            return l_desc.output_shape[-1]

        if op in (LayerType.batch_norm, LayerType.avgpool, LayerType.normalization, LayerType.activation):
            features_axis = 3 if len(l_desc.input_shape) == 4 else 1
            return l_desc.input_shape[features_axis]

        if op == LayerType.dw:
            # In a depthwise layer kernel_shape[3] is the depth multiplier and not the actual output features.
            return l_desc.kernel_shape[2] * l_desc.kernel_shape[3]

        numeric_targets = (
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
            EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,
        )
        if op == LayerType.deconv and self.target in numeric_targets:
            return l_desc.kernel_shape[3] * l_desc.strides[1] * l_desc.strides[2]

        if self._get_const("precision_split_zp", 0):
            return [l_desc.kernel_shape[-1], 2]
        return l_desc.kernel_shape[-1:]

    def _build_bias_names(self, l_desc):
        """
        Returns a mapping from logical bias variable keys to the variable names used for the given layer.

        The "bias" entry is named "conv_bias" for a deconv layer for which ``is_super_deconv`` is true, and
        "bias" otherwise. In ``double_scale_decomposition`` bias mode the names of the decomposed bias
        variables are added as well.
        """
        if l_desc.op == LayerType.deconv and is_super_deconv(l_desc):
            bias_variable_name = "conv_bias"
        else:
            bias_variable_name = "bias"

        bias_names = {}
        bias_names["bias"] = bias_variable_name
        bias_names["bias_factor"] = "bias_factor"
        bias_names["bias_feed_repeat"] = "bias_feed_repeat"

        if l_desc.precision_config.bias_mode == BiasMode.double_scale_decomposition:
            bias_names["bias_q_total_value"] = "bias_q_total_value"
            bias_names["bias_q_a"] = "bias_q_int8_vec_a"
            bias_names["bias_q_b"] = "bias_q_int8_vec_b"
            bias_names["bias_factor_a"] = "bias_factor_a"
            bias_names["bias_factor_b"] = "bias_factor_b"

        return bias_names

    def _build_decomposed_bias(self, l_desc, inp, bias_names):
        """
        Builds the bias add for a layer whose bias is decomposed into two quantized vectors.

        Three variables (the two bias vectors and the total value) are always created. Then, by target:
        * SDK_NUMERIC / SDK_DEBUG_PRECISE_NUMERIC - the bias is added with ``hsim.h_decomposed_bias_add``; the
          vectors and their factors are cast to int32 when ``self._is_int32_numeric`` says so.
        * SDK_PARTIAL_NUMERIC - the bias is ``(b_a * factor_a + b_b * factor_b) * bias_feed_repeat`` divided by
          ``self.emulation_mult_shift``, added with ``tf.nn.bias_add``.
        * any other target - the native "bias" variable is added with ``tf.nn.bias_add``.

        Args:
            l_desc: Layer description.
            inp: Tensor to add the bias to.
            bias_names: Variable names, as returned by ``_build_bias_names``.

        Returns:
            The bias add op (named "bias_op").
        """
        bias_shape = self._get_bias_shape(l_desc)
        bias_q_a = tf.Variable(tf.zeros(bias_shape), name=bias_names["bias_q_a"])
        bias_q_b = tf.Variable(tf.zeros(bias_shape), name=bias_names["bias_q_b"])
        # Created although it is not consumed by any op built in this method.
        _bias_q_total = tf.Variable(tf.zeros(bias_shape), name=bias_names["bias_q_total_value"])

        hsim_targets = (
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
        )
        quantized_targets = hsim_targets + (EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,)

        if self.target not in quantized_targets:
            # Important: this is the original native bias, in this mode it's not overrun by bias_q.
            native_bias = tf.Variable(tf.zeros(bias_shape), name=bias_names["bias"])
            return tf.nn.bias_add(inp, native_bias, name="bias_op")

        factor_a = tf.Variable(initial_value=0, dtype=tf.float32, trainable=False, name=bias_names["bias_factor_a"])
        factor_b = tf.Variable(initial_value=0, dtype=tf.float32, trainable=False, name=bias_names["bias_factor_b"])
        bias_feed_repeat = tf.Variable(
            initial_value=0,
            dtype=tf.float32,
            trainable=False,
            name=bias_names["bias_feed_repeat"],
        )

        if self.target not in hsim_targets:
            # Partial numeric: recompose the bias in floating point and compensate for the mult shift.
            recomposed_bias = (bias_q_a * factor_a + bias_q_b * factor_b) * bias_feed_repeat
            shift = tf.cast(self.emulation_mult_shift, recomposed_bias.dtype)
            recomposed_bias = tf.divide(recomposed_bias, shift)
            return tf.nn.bias_add(inp, recomposed_bias, name="bias_op")

        is_16bit_precision_mode = self._is_16bit_input(l_desc)
        if self._is_int32_numeric(l_desc, is_16bit_precision_mode):
            factor_a = tf.cast(factor_a, tf.int32)
            factor_b = tf.cast(factor_b, tf.int32)
            bias_q_a = tf.cast(bias_q_a, tf.int32)
            bias_q_b = tf.cast(bias_q_b, tf.int32)
        return self.hsim.h_decomposed_bias_add(
            inp,
            bias_q_a,
            bias_q_b,
            factor_a,
            factor_b,
            bias_feed_repeat,
            self.emulation_mult_shift,
            accumulator_size=self.accumulator_size,
            use_fp16_acc=False,
            name="bias_op",
        )

    def _build_bias(self, l_desc, inp, inter_layer_precision_mode, op_idx=None):
        """
        Builds Bias add to the graph.
        """
        is_16bit_precision_mode = self._is_16bit_input(l_desc)
        accumulator_size = self._get_accumulator_size(inter_layer_precision_mode, is_16bit_precision_mode)
        bias_names = self._build_bias_names(l_desc)
        use_double_bias = self._get_const("precision_split_zp", 0)

        if l_desc.precision_config.bias_mode == BiasMode.double_scale_decomposition:
            return self._build_decomposed_bias(
                l_desc,
                inp,
                bias_names,
            )  # TODO add fine tune to this case when the time comes...

        is_bias_mode_double_scale_initialization = (
            l_desc.precision_config.bias_mode == BiasMode.double_scale_initialization
        )
        if (
            self._is_int32_numeric(l_desc, is_16bit_precision_mode) or self.is_pluto_arch
        ) and is_bias_mode_double_scale_initialization:
            # bias is actually a double, so read it as double and cast to int32 before calling op.
            bias_type = tf.float32 if self.is_pluto_arch else tf.float64
            b = self._get_variable_with_reuse(
                "bias",
                tf.zeros(self._get_bias_shape(l_desc), dtype=bias_type),
                name=bias_names["bias"],
                shape=tf.TensorShape(None),
            )
            if op_idx is not None:
                b = b[..., op_idx]
            # TODO: SDK-39483 - This reduce sum is due to a bug in bias calculation, should be fixed in quantization
            if l_desc.decompose_weights and not self.is_pluto_arch:
                b = tf.reshape(b, [4, -1])
                b = tf.reduce_sum(b, axis=0)
                b = tf.reshape(b, [-1])
        else:
            b = self._get_variable_with_reuse(
                "bias",
                tf.zeros(self._get_bias_shape(l_desc), dtype=tf.float32),
                name=bias_names["bias"],
                shape=tf.TensorShape(None),
            )
            if op_idx is not None:
                b = b[..., op_idx]
        if use_double_bias:
            bias_even, bias_odd = tf.split(b, 2, axis=1)
            bias_even = tf.squeeze(bias_even, 1)
            bias_odd = tf.squeeze(bias_odd, 1)

            # Get the width dimension
            bias_width = inp.shape[2]

            # Create indices for even and odd columns
            even_indices = tf.range(0, bias_width, 2)
            odd_indices = tf.range(1, bias_width, 2)
            even_output = tf.gather(inp, even_indices, axis=2)
            odd_output = tf.gather(inp, odd_indices, axis=2)

        self._bias_variables[l_desc.name] = b
        if self.target in [
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
            EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,
        ]:
            mult_shift = self.emulation_mult_shift
            if is_bias_mode_double_scale_initialization and (
                self._is_int32_numeric(l_desc, is_16bit_precision_mode) or self.is_pluto_arch
            ):
                # Usually in double_scale_initialization mode the factor compensates on the mul_shift,
                # but in 16bit we dont have a shift, so the factor always 1
                b_factor = tf.cast(1, dtype=tf.float32)
                bias_feed_repeat = tf.cast(1, dtype=tf.float32)
                if self.is_pluto_arch:
                    mult_shift = tf.cast(1, tf.int8)
            else:
                b_factor = self._get_variable_with_reuse(
                    "bias_factor",
                    initial_value=0,
                    dtype=tf.float32,
                    trainable=False,
                    name=bias_names["bias_factor"],
                    shape=tf.TensorShape(None),
                )
                b_factor = tf.reshape(b_factor, [-1])
                if op_idx is not None:
                    b_factor = b_factor[op_idx]
                bias_feed_repeat = self._get_variable_with_reuse(
                    "bias_feed_repeat",
                    initial_value=0,
                    dtype=tf.float32,
                    trainable=False,
                    name=bias_names["bias_feed_repeat"],
                    shape=tf.TensorShape(None),
                )
                if op_idx is not None:
                    bias_feed_repeat = bias_feed_repeat[op_idx]

            if self.target in [
                EmulationInferenceTargets.SDK_NUMERIC,
                EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
            ]:
                if self._is_int32_numeric(l_desc, is_16bit_precision_mode):
                    b = tf.cast(b, tf.int32)
                    b_factor = tf.cast(b_factor, tf.int32)
                if l_desc.op == LayerType.conv:
                    if l_desc.padding == PaddingType.valid and l_desc.input_disparity > 1:
                        output_disparity = math.ceil(
                            (
                                l_desc.input_disparity
                                - l_desc.kernel_shape[2] // (l_desc.input_features // l_desc.input_disparity)
                                + 1
                            )
                            / (l_desc.strides[-1] / (l_desc.input_features / l_desc.input_disparity)),
                        )
                    elif l_desc.input_disparity > 1:
                        output_disparity = math.ceil(
                            l_desc.input_disparity
                            / (l_desc.strides[-1] / (l_desc.input_features / l_desc.input_disparity)),
                        )
                    else:
                        output_disparity = l_desc.input_disparity
                else:
                    output_disparity = 1
                b = tf.concat([b] * output_disparity, axis=0)
                if use_double_bias:
                    b_odd_op = self.hsim.h_bias_add(
                        odd_output,
                        bias_odd,
                        b_factor,
                        bias_feed_repeat,
                        mult_shift,
                        accumulator_size=accumulator_size,
                        use_fp16_acc=False,
                        name="bias_op",
                    )
                    b_even_op = self.hsim.h_bias_add(
                        even_output,
                        bias_even,
                        b_factor,
                        bias_feed_repeat,
                        mult_shift,
                        accumulator_size=accumulator_size,
                        use_fp16_acc=False,
                        name="bias_op",
                    )
                    a = tf.concat((b_even_op, b_odd_op), axis=3)  # now we have tensor of (h, w/2, 2f)
                    shape = a.shape
                    b_op = tf.reshape(a, [-1, shape[1], shape[2] * 2, shape[3] // 2])

                    # Concatenate the interleaved slices along axis 2
                else:
                    b_op = self.hsim.h_bias_add(
                        inp,
                        b,
                        b_factor,
                        bias_feed_repeat,
                        mult_shift,
                        accumulator_size=accumulator_size,
                        use_fp16_acc=False,
                        name="bias_op",
                    )
            else:
                b = b * b_factor * bias_feed_repeat
                shift = tf.cast(self.emulation_mult_shift, b.dtype)
                b = tf.round(tf.divide(b, shift))
                b_op = tf.nn.bias_add(inp, b, name="bias_op")
        else:
            if self.target in [EmulationInferenceTargets.SDK_FINE_TUNE]:
                bias_shape = self._get_bias_shape(l_desc)
                fine_tune_bias = tf.Variable(tf.zeros(bias_shape), dtype=tf.float32, name="fine_tune_bias")
                self._bias_delta_variables[l_desc.name] = fine_tune_bias
                b = b + fine_tune_bias
                if l_desc.name not in self.mixed_native_layers:
                    accumulator_size = inter_layer_precision_mode.accumulator_size
                    bias_mode = l_desc.precision_config.bias_mode
                    num_bits = (
                        accumulator_size
                        if bias_mode is not None and bias_mode.value == "double_scale_initialization"
                        else accumulator_size / 2
                    )
                    if num_bits == 32:
                        # Both cannot (TF limitations) and need not (lots of bits..) quantize
                        return tf.nn.bias_add(inp, b, name="bias_op")

                    quant_groups = l_desc.precision_config.quantization_groups
                    bias_scale = self._get_const("scale_bias", translated=True)
                    if quant_groups > 1:
                        bias_scale_chw = np.ones(bias_shape)
                        group_size = bias_shape[-1] / quant_groups
                        for grind, bias_scale_gr in zip(range(quant_groups), bias_scale):
                            bias_scale_chw[int(grind * group_size) : int((grind + 1) * group_size)] = bias_scale_gr
                        bias_scale = bias_scale_chw

                    bias_factor = self._get_const("bias_factor", translated=True)
                    if bias_factor == 0:  # happens for normalization and maybe more..
                        bias_factor = 1
                    try:
                        residue = self._get_const("residue", translated=True)
                        accumulator_offset = self._get_const("accumulator_offset", translated=True)
                    except Exception:
                        residue = np.array(0.0)
                        accumulator_offset = np.array(0.0)

                    # Ideally we'd like to fully simulate the quantization process,
                    #    with residue, wraparound etc. (see quantize_model.py)

                    if residue.shape != b.shape:
                        residue = np.array(0.0)  # TODO take proper care of deconv where shape is different!
                        accumulator_offset = np.array(0.0)

                    # something goes wrong here when doing QFT+groupwise (any layer, not only this one..)
                    simulate_bias_wraparound = True
                    if simulate_bias_wraparound:
                        b_q = (b / bias_scale + residue) / bias_factor
                        acc_ovf = 2 ** (accumulator_size - 1)
                        warps = tf.floor((b_q + acc_ovf) / (2 * acc_ovf))
                        b_q -= warps * 2 * acc_ovf  # wrapping simulation
                    else:
                        b_q = b / bias_scale / bias_factor

                    # something goes wrong here when doing QFT+groupwise (any layer, not only this one..)
                    simulate_bias_quantization = True
                    if simulate_bias_quantization:
                        if num_bits == accumulator_size:
                            maxb = 2 ** (num_bits - 1) - 1
                        else:
                            # Scale is (roughly..) given by the bias range, a decomposition will be made to fully map onto -127,..,127
                            maxb = tf.reduce_max(input_tensor=tf.abs(b_q))
                        b_q = tf.quantization.fake_quant_with_min_max_vars(b_q, -maxb, maxb, num_bits=num_bits)

                    simulate_acc_wraparound = False  # try this at evaluation time..
                    if simulate_acc_wraparound:  # and residue.shape == b.shape:
                        # Align also the bias-add input (aka conv out)
                        #    to accumulator scale & offset - simulate result of *quantized* D_in * W
                        pre_bias_accumulator = (inp / bias_scale + accumulator_offset) / bias_factor
                        # Create fully simulated quantized accumulator (possible including wraparound!)
                        b_op = tf.add(pre_bias_accumulator, b_q)
                        b_op = tf.math.floormod(b_op + acc_ovf, 2 * acc_ovf) - acc_ovf
                        # - retrace our steps FOR THE ACCUMULATOR to return to "native" scale & offset for activation application.
                        #     note that in this mode acitvation is native (APU is NOT simulated (not yet..))
                        b_op *= bias_factor
                        b_op -= residue + accumulator_offset
                        b_op *= bias_scale
                        return tf.add(b_op, 0.0, name="bias_op")  # just to have the tensor by name..
                    elif simulate_bias_wraparound:
                        b_q += warps * 2 * acc_ovf
                        b = bias_scale * (b_q * bias_factor - residue)
                    else:
                        b = bias_scale * b_q * bias_factor

            b_op = tf.nn.bias_add(inp, b, name="bias_op")
        return b_op

    def _get_const(self, const_name, default_value=None, translated=False):
        if self.current_scope.name in self.local_consts:
            if const_name in self.local_consts[self.current_scope.name]:
                return self.local_consts[self.current_scope.name][const_name]

        scope_name = self.current_scope.name
        consts = self.consts if not translated else self.translated_consts
        if const_name in consts and scope_name in consts[const_name]:
            return np.float32(consts[const_name][scope_name])
        if default_value is not None:
            return default_value
        raise MissingConstError(
            f"No value found for const {const_name} while in scope {scope_name}",
        )

    def get_reused_variable(self, var_name):
        """Return the variable cached for ``var_name`` under the current TF name scope, or None if absent."""
        # The cache key is the current name scope string directly followed by the variable name
        cache_key = tf.get_current_name_scope() + var_name
        return self._reused_variables[cache_key] if cache_key in self._reused_variables else None

    def set_reused_variable(self, var_name, value):
        """Cache ``value`` under the key made of the current TF name scope followed by ``var_name``."""
        self._reused_variables[tf.get_current_name_scope() + var_name] = value

    def _get_variable_with_reuse(
        self, var_name, initial_value=None, trainable=None, name=None, dtype=None, constraint=None, shape=None
    ):
        """
        Return the cached variable for ``var_name`` in the current name scope, creating it on first use.

        All arguments except ``var_name`` are forwarded to ``tf.Variable`` and are only used when
        no cached variable exists yet.
        """
        cached = self.get_reused_variable(var_name)
        if cached is not None:
            return cached

        created = tf.Variable(
            initial_value,
            trainable=trainable,
            name=name,
            dtype=dtype,
            constraint=constraint,
            shape=shape,
        )
        self.set_reused_variable(var_name, created)
        return created

    def _build_activation(self, activation, inp, l_desc, inter_layer_precision_mode):
        """
        Build the activation op of ``l_desc`` on top of ``inp`` according to the emulation target.

        * SDK_NATIVE / SDK_FP_OPTIMIZED / SDK_DEBUG_PRECISE_NUMERIC: the floating point activation,
          wrapped by the clipping op when ``self._enable_clipping`` is set.
        * SDK_FINE_TUNE: the floating point activation, passed through the fine-tune quantization
          builder when activations should be quantized and the layer is not a mixed-native one.
        * SDK_NUMERIC / SDK_PARTIAL_NUMERIC: delegated to ``_build_piecewise_activation``.

        Raises:
            BackendNotImplementedError: if ``self.target`` is none of the targets above.
        """

        def _weightless_default(value):
            # Fallback values are only offered to _get_const when the model is built without weights
            return value if self.force_weightless_model else None

        def _identity(x):
            return x

        def _leaky_relu(x):
            alpha = self._get_const("leaky_alpha", _weightless_default(0.2))
            return tf.nn.leaky_relu(x, alpha=alpha)

        def _threshold(x):
            th = self._get_const("activation_threshold", _weightless_default(0.3))
            return tf.keras.activations.relu(x, threshold=th)

        def _biased_delta(x):
            bias_val = self._get_const("activation_delta_bias", _weightless_default(-1.0))
            # sign(|x|) is 0 at x == 0 and 1 everywhere else
            return bias_val * tf.sign(tf.abs(x))

        def _mish(x):
            return x * tf.tanh(tf.nn.softplus(x))

        def _inv_pos(x):
            return 1.0 / x

        def _prelu(x):
            slope = self._get_const("prelu_slope", _weightless_default(np.array(0.25)))
            positive_part = tf.nn.relu(x)
            # (x - |x|) / 2 equals min(x, 0)
            negative_part = slope * (x - abs(x)) * 0.5
            return positive_part + negative_part

        def _hardswish(x):
            return x * tf.nn.relu6(x + 3) / 6

        def _swish(x):
            beta = self._get_const("swish_beta", _weightless_default(1.0))
            return x * tf.nn.sigmoid(beta * x)

        def _relu1(x):
            return tf.nn.relu6(x * 6.0) / 6.0

        def _less(x):
            reference = self._get_const(
                "activation_less_values",
                _weightless_default(np.array(0.0, dtype=np.float32)),
            )
            return tf.cast(tf.math.less(x, reference), tf.float32)

        def _greater(x):
            reference = self._get_const(
                "activation_greater_values",
                _weightless_default(np.array(0.0, dtype=np.float32)),
            )
            return tf.cast(tf.math.greater(x, reference), tf.float32)

        def _pow(x):
            exponent = self._get_const("pow_exponent", 1.0)
            return tf.cast(tf.pow(x, exponent), tf.float32)

        def _hardsigmoid(x):
            # each framework has its own default alpha and beta, default value is determined by the framework parser
            alpha = self._get_const("hardsigmoid_alpha", 1.0)
            beta = self._get_const("hardsigmoid_beta", 0.5)
            upper_bounded = tf.math.minimum(tf.constant(alpha) * x + tf.constant(beta), tf.constant([1.0]))
            return tf.math.maximum(tf.constant([0.0]), upper_bounded)

        def _clip(x):
            clip_min = self._get_const("clip_min")
            upper_bounded = tf.math.minimum(x, self._get_const("clip_max"))
            return tf.math.maximum(clip_min, upper_bounded)

        def _inv_sqrt(x):
            return 1 / tf.sqrt(x)

        # Floating point implementation of every supported activation type
        native_activations = {
            ActivationType.linear: _identity,
            ActivationType.relu: tf.nn.relu,
            ActivationType.leaky: _leaky_relu,
            ActivationType.relu6: tf.nn.relu6,
            ActivationType.elu: tf.nn.elu,
            ActivationType.sigmoid: tf.nn.sigmoid,
            ActivationType.exp: tf.exp,
            ActivationType.tanh: tf.tanh,
            ActivationType.threshold: _threshold,
            ActivationType.biased_delta: _biased_delta,
            ActivationType.softplus: tf.nn.softplus,
            ActivationType.silu: tf.nn.silu,
            ActivationType.gelu: tf.nn.gelu,
            ActivationType.mish: _mish,
            ActivationType.inv_pos: _inv_pos,
            ActivationType.prelu: _prelu,
            ActivationType.hardswish: _hardswish,
            ActivationType.swish: _swish,
            ActivationType.sqrt: tf.sqrt,
            ActivationType.relu1: _relu1,
            ActivationType.less: _less,
            ActivationType.log: tf.math.log,
            ActivationType.hardsigmoid: _hardsigmoid,
            ActivationType.clip: _clip,
            ActivationType.inv_sqrt: _inv_sqrt,
            ActivationType.greater: _greater,
            ActivationType.pow: _pow,
        }

        float_targets = (
            EmulationInferenceTargets.SDK_NATIVE,
            EmulationInferenceTargets.SDK_FP_OPTIMIZED,
            EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
        )
        if self.target in float_targets:
            activation_op = native_activations[activation](inp)
            if not self._enable_clipping:
                return activation_op
            return self._build_clipped_activation(l_desc, activation_op)

        if self.target in (EmulationInferenceTargets.SDK_FINE_TUNE,):
            quantize = (
                self.fine_tune_params.should_quantize_activations and l_desc.name not in self.mixed_native_layers
            )
            if not quantize:
                return native_activations[activation](inp)
            output_activation_bits = inter_layer_precision_mode.output_activation_bits
            native_activation = native_activations[activation]
            return self._build_fine_tune_activation(inp, l_desc, output_activation_bits, native_activation)

        numeric_targets = (
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,
        )
        if self.target in numeric_targets:
            return self._build_piecewise_activation(
                inp,
                activation,
                l_desc,
                inter_layer_precision_mode,
            )

        raise BackendNotImplementedError(f"Target {self.target} is not supported in Emulator")

    def _build_clipped_activation(self, l_desc, activation):
        """
        Attach a histogram op and a clip-by-value op to ``activation``.

        The histogram tensor is stored in ``self._activation_histograms`` under the layer name, and
        the three variables created here (clipping values, histogram range, histogram bin count) are
        registered through ``_add_variables_to_initialize``.

        Returns:
            The clipped activation tensor. With the initial clipping values (-inf, +inf) no value is clipped.
        """
        range_var = tf.Variable(initial_value=[0.0, 1.0], name="activation_clipping_hist_range")
        nbins_var = tf.Variable(initial_value=10, name="activation_clipping_hist_nbins")
        self._activation_histograms[l_desc.name] = tf.histogram_fixed_width(
            activation,
            value_range=range_var,
            nbins=nbins_var,
            name="activation_hist",
            dtype=tf.int64,
        )

        clip_limits = tf.Variable(initial_value=[-np.inf, np.inf], name="activation_clipping_values")
        self._add_variables_to_initialize(l_desc.name, [clip_limits, range_var, nbins_var])

        lower, upper = clip_limits[0], clip_limits[1]
        return tf.clip_by_value(
            activation,
            clip_value_min=lower,
            clip_value_max=upper,
            name="clipped_activation",
        )

    # TODO: take HW consts properly (SDK-8991)
    def _build_fine_tune_weights(
        self,
        k,
        l_desc,
        inter_layer_precision_mode,
        out_stage_mantissa_size=10,
        simulate_nudging=False,
    ):
        """
        Build the fine-tune ("fake-quantized") version of kernel ``k`` and return it.

        The trainable delta of the layer is read from ``self._kernel_delta_variables[l_desc.name]``,
        so it must have been registered before this method is called.

        Side effects:
            * creates the "weights_clipping_values" variable and registers it through
              ``_add_variables_to_initialize``
            * stores the alpha tensor in ``self._ft_alpha_tensors[l_desc.name]``
            * stores the returned tensor in ``self._ft_final_kernel_tensors[l_desc.name]``

        Args:
            k: kernel tensor to quantize.
            l_desc: layer description (name, op, precision_config and output dimensions are read).
            inter_layer_precision_mode: provides ``weight_bits``.
            out_stage_mantissa_size: not referenced in this method body.
            simulate_nudging: not referenced in this method body.

        Returns:
            The "final_kernel" tensor.
        """
        weight_bits = inter_layer_precision_mode.weight_bits
        # 4-bit weights are detected from the reduced precision mode; a missing precision mode means "no"
        use_4bit_weights = (
            False
            if l_desc.precision_config.precision_mode is None
            else l_desc.precision_config.precision_mode.reduce()
            in [
                PrecisionMode.a8_w4,
                PrecisionMode.a8_w4_exp,
            ]
        )
        k_orig = k
        # Normalization/conv layers with transposed width-features use the output width as the channel count
        output_features = (
            l_desc.output_width
            if l_desc.op in {LayerType.normalization, LayerType.conv} and l_desc.transpose_output_width_features
            else l_desc.output_features
        )
        # Single global max(|k|), replicated once per output feature
        kernmax = tf.reduce_max(input_tensor=tf.abs(k), name="kmax")
        kernmax = tf.broadcast_to(kernmax, [output_features], name="broadcast_kmax")
        # Clipping values start at (-inf, +inf), i.e. they do not restrict the range until assigned
        min_vals_per_channel = tf.fill([output_features], -np.inf)
        max_vals_per_channel = tf.fill([output_features], np.inf)
        wcv = tf.Variable(
            initial_value=[min_vals_per_channel, max_vals_per_channel],
            trainable=False,
            name="weights_clipping_values",
        )
        # wcv[0] holds the lower clipping values, wcv[1] the upper ones
        kernmax = tf.math.minimum(wcv[1], kernmax, name="kernmax")
        kernmin = tf.math.maximum(wcv[0], -kernmax, name="kernmin")
        self._add_variables_to_initialize(l_desc.name, [wcv])

        kernel_delta = self._kernel_delta_variables[l_desc.name]
        k_tuned = k + kernel_delta

        if use_4bit_weights:
            self._logger.debug(f"Reducing {l_desc.name} to {weight_bits} bits")
        if weight_bits == 16:
            # 16-bit weights lose log2(bias_factor) bits
            weight_bits -= int(np.log2(self._get_const("bias_factor", translated=True)))

        # Support "alpha-blending" - Annealed linear combination of quantized and original kernel.
        # ( https://arxiv.org/pdf/1903.01061.pdf ; ARM, 2019,  "Learning low-precision neural networks without Straight-Through Estimator (STE)" )
        #  externally triggered by client setting non-zero alpha via feed_dict (hence the need for the export)
        alpha = tf.constant(0, dtype=tf.float32, name="alpha")
        self._ft_alpha_tensors[l_desc.name] = alpha

        if self.fine_tune_params.should_relax_weights and use_4bit_weights:
            k_blended = self._build_relaxed_quant_weights(l_desc, k_orig, kernel_delta, tf.reduce_max(kernmax), alpha)
        else:
            # The standard STE ("Straight-Through Estimator") approach
            # if quant_groups and quant_groups > 1:
            #     def fqfunc(_k, **kwargs):
            #         _k = tf.squeeze(_k)  # e.g. remove last 1-dim in SA-BN
            #         _k = tf.quantization.fake_quant_with_min_max_vars_per_channel(_k, **kwargs)
            #         return tf.reshape(_k, k_orig.shape)
            # else:
            #     fqfunc = tf.quantization.fake_quant_with_min_max_vars

            # A single scalar range (min of kernmin, max of kernmax) is used for the whole kernel
            k_fq = tf.quantization.fake_quant_with_min_max_vars(
                k_tuned,
                min=tf.reduce_min(kernmin),
                max=tf.reduce_max(kernmax),
                num_bits=weight_bits,
                narrow_range=True,
                name=f"{l_desc.name}_fine_tune_quant_weight",
            )

            # Replace by AlphaBlend with "fake-quant" turned into "real-quant" by stopping gradient for alpha!=0
            k_fq_grad_if_no_alpha = tf.cond(
                pred=tf.equal(alpha, 0.0),
                true_fn=lambda: k_fq,
                false_fn=lambda: tf.stop_gradient(k_fq),
            )
            # (!!) Note we're falling back to STE when alpha is zero, which is both the default,
            #      and (usually) the endpoint of decay - enabling AlphaBlend as a "warmup" for STE
            # NOTE(review): the non-quantized term below uses `k` (without kernel_delta) while the quantized
            #      term is built from `k_tuned`; for alpha != 0 the quantized term is gradient-stopped, so
            #      kernel_delta has no differentiable path into k_blended - confirm `k` (not `k_tuned`) is intended.
            k_blended = alpha * k + (1 - alpha) * k_fq_grad_if_no_alpha

        # We here help the client-side with train-to-test consistency,
        #  by exporting the "final" kernel as used later in CONV (or other) operation, after all transforms, s.a.:
        #  adding the trainable delta, quantizing, clipping, blending, scaling , etc. etc. including future ones.. (!)
        k_final = tf.add(k_blended, 0.0, name="final_kernel")
        self._ft_final_kernel_tensors[l_desc.name] = k_final

        return k_final

    def _build_relaxed_quant_weights(self, l_desc, k_orig, kernel_delta, kernmax, alpha, bits=4):
        """
        Relaxed Quantization (inspired by https://arxiv.org/pdf/2004.10568.pdf ; QC, 2020)

        The kernel and its delta are mapped onto the integer grid (``2 ** (bits - 1) - 1`` bins per side),
        the sum is restricted to the grid cell of the original kernel, and the result is mapped back.
        The fractional part of the clipped grid value is exported through
        ``self._ft_kernel_frac_part_tensors[l_desc.name]`` for the client-side "relaxation loss".

        Note that 'alpha' here is used in a different sense from AlphaBlend:
        it only blends between the clipped and the not-clipped kernel, making the clipping gradual too.
        """
        levels = 2 ** (bits - 1) - 1  # one-sided bin count, 7 for the default bits=4
        grid_orig = k_orig * levels / kernmax
        grid_delta = kernel_delta * levels / kernmax

        # Keep the tuned value inside [floor(q_o), floor(q_o) + 1]; the unrestricted alternative
        # would be plain `grid_orig + grid_delta` (empirically the restriction helps, reason unclear)
        grid_sum = grid_orig + grid_delta
        cell_floor = tf.floor(grid_orig)
        cell_ceil = tf.floor(grid_orig) + 1
        grid_mod = tf.clip_by_value(grid_sum, cell_floor, cell_ceil)

        grid_mod_clipped = tf.clip_by_value(grid_mod, -levels, levels)
        frac_part = tf.subtract(grid_mod_clipped, tf.floor(grid_mod_clipped))

        # Export the "fractional part" tensor to be used on the client side for the "relaxation loss":
        frac_part = tf.add(frac_part, 0, name="q_mod_cl_frac")
        self._ft_kernel_frac_part_tensors[l_desc.name] = frac_part

        unclipped_kernel = grid_mod / levels * kernmax
        clipped_kernel = grid_mod_clipped / levels * kernmax
        return alpha * unclipped_kernel + (1 - alpha) * clipped_kernel

    def _build_fine_tune_activation(self, inp, l_desc, activation_bits, native_activation_callback):
        """
        Apply the native activation to ``inp`` and fake-quantize its output.

        The quantization range is the translated "limvals_out" const of the current scope.

        Raises:
            BackendEmulatorException: when both range limits are strictly negative or strictly positive.
        """
        range_min, range_max = self._get_const("limvals_out", translated=True)
        activated = native_activation_callback(inp)

        both_negative = range_min < 0 and range_max < 0
        both_positive = range_min > 0 and range_max > 0
        if both_negative or both_positive:
            # TODO work around this
            raise BackendEmulatorException(
                "Can't fake quant properly because TF got undefined behavior "
                "when doing fake-quant with same-sign limits",
            )

        return tf.quantization.fake_quant_with_min_max_args(
            activated,
            min=range_min,
            max=range_max,
            num_bits=activation_bits,
            name=f"{l_desc.name}_fine_tune_quant_activation",
        )

    def _build_batchnorm(self, l_desc, inp, inter_layer_precision_mode, elementwise=None):
        """
        Build a batch-norm layer as a 1x1 depthwise op named "bn_op".

        When ``l_desc.input_shape`` has two dimensions the input is viewed as [-1, 1, 1, features]
        for the op and the result is reshaped back to [-1, features].

        Returns:
            Tuple of (batch-norm output tensor, kernel tensor as passed to the depthwise op).
        """
        features = l_desc.input_shape[-1]
        is_rank2 = len(l_desc.input_shape) == 2
        if is_rank2:
            inp = tf.reshape(inp, shape=[-1, 1, 1, features])

        kernel_placeholder = tf.Variable(tf.zeros([1, 1, features]), name="kernel")
        k, k_var = self._build_kernel(kernel_placeholder, l_desc)
        self._kernel_variables[l_desc.name] = k_var

        numeric_targets = (
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
        )
        # Numeric targets get the extra axis in front, every other target gets it as the last axis
        new_axis = 0 if self.target in numeric_targets else 3
        k = tf.expand_dims(k, new_axis)

        bn = self._build_1x1_depthwise(inp, k, l_desc, inter_layer_precision_mode, "bn_op", elementwise)
        if is_rank2:
            bn = tf.reshape(bn, shape=[-1, features])

        return bn, k

    def _build_normalization(self, l_desc, inp, inter_layer_precision_mode, elementwise=None):
        """
        Build a normalization layer as a 1x1 depthwise op named "normalization_op".

        When ``l_desc.input_shape`` has two dimensions the input is viewed as [-1, 1, 1, features]
        for the op and the result is reshaped back to [-1, features].

        Returns:
            Tuple of (normalization output tensor, kernel tensor as passed to the depthwise op).
        """
        features = l_desc.input_shape[-1]
        is_rank2 = len(l_desc.input_shape) == 2
        if is_rank2:
            inp = tf.reshape(inp, shape=[-1, 1, 1, features])

        kernel_placeholder = tf.Variable(tf.zeros([1, 1, features, 1]), name="kernel")
        k, k_var = self._build_kernel(kernel_placeholder, l_desc)
        self._kernel_variables[l_desc.name] = k_var

        numeric_targets = (
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
        )
        if self.target in numeric_targets:
            # Numeric targets take the kernel with its two last axes swapped
            k = tf.transpose(a=k, perm=[0, 1, 3, 2])

        normalization_op = self._build_1x1_depthwise(
            inp, k, l_desc, inter_layer_precision_mode, "normalization_op", elementwise
        )
        if is_rank2:
            normalization_op = tf.reshape(normalization_op, shape=[-1, features])

        return normalization_op, k

    def _build_standalone_activation(self, l_desc, inp, inter_layer_precision_mode):
        """
        Build the pre-activation part of a standalone activation layer.

        A rank-2 ``inp`` is first viewed as [-1, 1, 1, features]. For the SDK_NATIVE, SDK_FP_OPTIMIZED
        and SDK_FINE_TUNE targets that (possibly reshaped) input is returned as is with a None kernel;
        for the remaining targets a 1x1 depthwise op named "standalone_activation_op" is built.

        Returns:
            Tuple of (output tensor, kernel tensor or None).
        """
        if len(inp.shape) == 2:
            inp = tf.reshape(inp, [-1, 1, 1, inp.shape[1]])

        passthrough_targets = (
            EmulationInferenceTargets.SDK_NATIVE,
            EmulationInferenceTargets.SDK_FP_OPTIMIZED,
            EmulationInferenceTargets.SDK_FINE_TUNE,
        )
        if self.target in passthrough_targets:
            return inp, None

        kernel_placeholder = tf.Variable(tf.zeros([1, 1, l_desc.input_shape[-1], 1]), name="kernel")
        k, k_var = self._build_kernel(kernel_placeholder, l_desc)
        self._kernel_variables[l_desc.name] = k_var

        numeric_targets = (
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
        )
        if self.target in numeric_targets:
            # Numeric targets take the kernel with its two last axes swapped
            k = tf.transpose(a=k, perm=[0, 1, 3, 2])

        activation_op = self._build_1x1_depthwise(
            inp, k, l_desc, inter_layer_precision_mode, "standalone_activation_op"
        )
        return activation_op, k

    def _build_1x1_depthwise(self, inp, kernel, l_desc, inter_layer_precision_mode, name, elementwise=None):
        """
        Build a 1x1 depthwise convolution of ``inp`` with ``kernel``.

        * SDK_NUMERIC / SDK_DEBUG_PRECISE_NUMERIC: built through ``_build_numeric_conv`` with the
          ``h_depth_wise`` simulator op; ``elementwise`` defaults to a single zero.
        * every other target: a native TF depthwise convolution. For SDK_FINE_TUNE a "kernel_delta"
          variable is created and registered, and the kernel is optionally replaced by its
          fine-tune quantized version.

        Returns:
            The output tensor of the depthwise op.
        """
        numeric_targets = (
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
        )

        if self.target not in numeric_targets:
            if self.target in (EmulationInferenceTargets.SDK_FINE_TUNE,):
                with tf.compat.v1.variable_scope("fine_tune_weights"):
                    kernel_delta = tf.Variable(tf.zeros(tf.shape(kernel)), dtype=tf.float32, name="kernel_delta")
                self._kernel_delta_variables[l_desc.name] = kernel_delta
                quantize_weights = (
                    self.fine_tune_params.should_quantize_weights and l_desc.name not in self.mixed_native_layers
                )
                if quantize_weights:
                    kernel = self._build_fine_tune_weights(kernel, l_desc, inter_layer_precision_mode)
            return tf.compat.v1.nn.depthwise_conv2d_native(inp, kernel, [1, 1, 1, 1], "VALID", name=name)

        is_16bit_precision_mode = self._is_16bit_input(l_desc)
        use_int32 = self._is_int32_numeric(l_desc, is_16bit_precision_mode)
        if use_int32:
            kernel = tf.cast(kernel, tf.int32)

        if elementwise is None:
            # No elementwise input: feed a single zero of the matching dtype
            elementwise = tf.zeros([1], tf.int32 if use_int32 else tf.float32)
        elif use_int32:
            elementwise = tf.cast(elementwise, tf.int32)

        accumulator_size = self._get_accumulator_size(inter_layer_precision_mode, is_16bit_precision_mode)

        # Activation and proposal-generator layers take the shape from the kernel tensor itself
        if l_desc.op in [LayerType.activation, LayerType.proposal_generator]:
            kernel_shape = kernel.shape
        else:
            kernel_shape = l_desc.kernel_shape

        return self._build_numeric_conv(
            l_desc,
            inp,
            kernel,
            kernel_shape,
            elementwise,
            accumulator_size,
            [1, 1, 1, 1],
            self._get_dilation(l_desc),
            self.hsim.h_depth_wise,
            name=name,
        )

    def _build_add(self, l_desc, inp, second_inp):
        """Return ``inp + second_inp``, built inside the "elementwise_addition" variable scope."""
        with tf.compat.v1.variable_scope("elementwise_addition"):
            # A disabled variant scaled the second input by 2 ** scale (non-trainable 'scale' variable):
            #   tf.add(inp, tf.scalar_mul(2.0 ** scale, second_inp))
            summed = tf.add(inp, second_inp)
        return summed

    def _build_standalone_ew_add_sub(
        self,
        l_desc,
        inputs,
        inter_layer_precision_mode,
        operation,
        inputs_partition=None,
    ):
        """
        Build a standalone elementwise add/sub of two inputs.

        Every input is first expanded with ``tf.repeat`` according to ``l_desc.input_repeats``
        (repeat ``r`` of entry ``dim`` is applied on axis ``dim + 1``).

        * SDK_NUMERIC / SDK_DEBUG_PRECISE_NUMERIC: the simulator elementwise-add op is applied on the
          two inputs selected by ``inputs_partition``, with their matching slices of the "kernel" variable.
        * SDK_PARTIAL_NUMERIC: ``inputs[0]`` and ``inputs[1]`` are multiplied by their kernel slices and summed.
        * any other target: plain ``tf.subtract`` for ``LayerType.ew_sub``, plain ``tf.add`` otherwise.

        Args:
            l_desc: layer description (``output_shape`` and ``input_repeats`` are read).
            inputs: sequence of input tensors; it is NOT modified by this method.
            inter_layer_precision_mode: used to resolve the accumulator size for int32 numeric layers.
            operation: layer type, only checked against ``LayerType.ew_sub`` for the non-numeric targets.
            inputs_partition: the two indices into ``inputs`` to operate on (numeric targets only);
                defaults to ``range(len(inputs))``.

        Returns:
            The output tensor of the elementwise op.
        """
        features = l_desc.output_shape[-1]

        # Work on a private copy: writing the repeated tensors back into the caller's list would make
        # the repeats compound if the same list is passed again (e.g. once per inputs_partition).
        inputs = list(inputs)
        for i, repeats in enumerate(l_desc.input_repeats):
            for dim, r in enumerate(repeats):
                inputs[i] = tf.repeat(inputs[i], r, axis=dim + 1)

        original_inputs_number = len(inputs)
        if inputs_partition is None:
            inputs_partition = range(original_inputs_number)
        assert len(inputs_partition) == 2, "standalone elementwise add/sub operates on exactly two inputs"
        if self.target in [EmulationInferenceTargets.SDK_NUMERIC, EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC]:
            weights = self._get_variable_with_reuse(
                "kernel",
                tf.random.normal([len(inputs) * features], stddev=0.35),
                name="kernel",
                shape=tf.TensorShape(None),
            )

            # Keep only the kernel slices of the two selected inputs
            splitted_weights = tf.split(weights, original_inputs_number, axis=0)
            weights = tf.concat([splitted_weights[inputs_partition[0]], splitted_weights[inputs_partition[1]]], axis=0)

            # Stretch the kernel to 2 * features entries when it holds fewer values
            repeat_count = (features * 2) / tf.shape(weights)[0]
            weights = tf.repeat(weights, tf.cast(repeat_count, tf.int32))

            is_16bit_precision_mode = self._is_16bit_input(l_desc)
            accumulator_size = self.accumulator_size
            if self._is_int32_numeric(l_desc, is_16bit_precision_mode):
                weights = tf.cast(weights, tf.int32)
                accumulator_size = self._get_accumulator_size(inter_layer_precision_mode, is_16bit_precision_mode)
            ew_add = self.hsim.h_standalone_elementwise_add(
                inputs[inputs_partition[0]],
                inputs[inputs_partition[1]],
                weights,
                self.emulation_mult_shift,
                accumulator_size=accumulator_size,
                use_fp16_acc=False,
                name="standalone_elemenwise_op",
            )
        elif self.target == EmulationInferenceTargets.SDK_PARTIAL_NUMERIC:
            weights = tf.Variable(tf.random.normal([len(inputs) * features], stddev=0.35), name="kernel")

            first_input_weights = weights[:features]
            second_input_weights = weights[features:]
            in_mult0 = inputs[0] * first_input_weights
            in_mult1 = inputs[1] * second_input_weights
            # when type is ew_sub, second_input_weights is negative.
            ew_add = tf.add(in_mult0, in_mult1)
        elif operation == LayerType.ew_sub:
            ew_add = tf.subtract(inputs[0], inputs[1])
        else:
            ew_add = tf.add(inputs[0], inputs[1])

        return ew_add

    def _ew_mult_bankers_rounding(self, input):
        """
        Divide ``input`` with round-half-to-even, the divisor depending on the architecture.

        * mercury: ``input`` is first re-read as a 17-bit signed value, then divided by 2.
        * otherwise: ``input`` is divided by 4.
        """
        if not self.is_mercury_arch:
            # Round up on a .75 remainder, or on a .5 remainder whose floored quotient is odd
            three_quarters = tf.equal(input % 4, 3)
            half_on_odd = tf.equal(input % 8, 6)
            round_up = tf.logical_or(three_quarters, half_on_odd)
            return tf.floor(input / 4) + tf.cast(round_up, tf.float32)

        # Due to a hw bug - we are reading the result as int17 (instead of int18)
        as_int = tf.cast(input, tf.int32)
        low_16_bits = tf.bitwise.bitwise_and(as_int, (2**16) - 1)
        sign_bit = tf.bitwise.bitwise_and(as_int, 2**16)
        int17_value = tf.cast(tf.subtract(low_16_bits, sign_bit), tf.float32)
        # Round up on a .5 remainder whose floored quotient is odd
        round_up = tf.equal(int17_value % 4, 3)
        return tf.floor(int17_value / 2) + tf.cast(round_up, tf.float32)

    def _round_input_9bit(self, input):
        """Wrap ``input`` to a signed 9-bit value.

        The value is cast to int32, its low 8 bits are kept as the magnitude and
        bit 8 is treated as the sign bit (subtracted), then the result is cast
        back to float32.
        """
        as_int = tf.cast(input, tf.int32)
        low_eight_bits = tf.bitwise.bitwise_and(as_int, 255)
        sign_bit = tf.bitwise.bitwise_and(as_int, 256)
        return tf.cast(tf.subtract(low_eight_bits, sign_bit), tf.float32)

    def _get_ew_inputs(self, inputs, l_desc):
        """Build the two scaled, biased and 9-bit-wrapped operands of an ew-mult.

        Each input is multiplied by a scalar taken from its own ``kernel_*``
        variable (first element of the flattened kernel) divided by
        ``self.emulation_mult_shift``, offset by its ``bias_in_*`` variable and
        finally passed through ``_round_input_9bit``.

        Args:
            inputs: Sequence whose first two entries are the operand tensors.
            l_desc: Layer description (not used by this method).

        Returns:
            Tuple ``(input_a, input_b)`` of the processed operands.
        """

        def first_kernel_element(features, name):
            # Only element 0 of the flattened kernel is used as the multiplier.
            kernel = tf.Variable(
                tf.random.normal([1, 1, features, 1], stddev=0.35),
                name=name,
                shape=tf.TensorShape(None),
            )
            return tf.reshape(kernel, [-1])[0]

        bias_in_a = tf.Variable(0, dtype=tf.float32, name="bias_in_a")
        bias_in_b = tf.Variable(0, dtype=tf.float32, name="bias_in_b")
        weights_a = first_kernel_element(inputs[0].shape[-1], "kernel_a")
        weights_b = first_kernel_element(inputs[1].shape[-1], "kernel_b")

        shift = tf.cast(self.emulation_mult_shift, np.float32)
        scaled_a = inputs[0] * (weights_a / shift) + bias_in_a
        scaled_b = inputs[1] * (weights_b / shift) + bias_in_b
        wrapped_a = self._round_input_9bit(scaled_a)
        wrapped_b = self._round_input_9bit(scaled_b)
        return wrapped_a, wrapped_b

    def _ew_mult_add_zp_to_tensor(self, tensor, x, is_input_tensor):
        """Triple every column of ``tensor``, interleaving a constant ``x``.

        Each element ``t`` becomes three consecutive elements: ``[t, t, x]`` when
        ``is_input_tensor`` is true, otherwise ``[t, x, t]``.

        Args:
            tensor: Tensor whose second dimension (``shape[1]``) is the column count.
            x: Value used to fill a tensor of the same shape as ``tensor``.
            is_input_tensor: Selects which of the two interleaving layouts is used.

        Returns:
            Tensor reshaped to ``[-1, tensor.shape[1] * 3]``.
        """
        filler = tf.fill(tf.shape(tensor), x)
        triple = (tensor, tensor, filler) if is_input_tensor else (tensor, filler, tensor)
        # Stack the flattened parts as rows; transposing interleaves them element-wise.
        rows = tf.stack([tf.reshape(part, [-1]) for part in triple], axis=0)
        return tf.reshape(tf.transpose(rows), [-1, tensor.shape[1] * 3])

    def _build_ew_mult_reduce_sum(self, l_desc, inputs, inter_layer_precision_mode, is_square):
        """Build an element-wise multiply followed by a grouped sum, as a matmul.

        Both operands are flattened so that every pixel/group becomes a row,
        expanded with their zero-point columns and multiplied through
        ``self.hsim.h_matmul``; the bias is then applied via ``_build_bias``.

        Args:
            l_desc: Layer description (``input_repeats``, ``input_features``,
                ``_reduce_sum_groups``, ``input_height`` and ``input_width`` are read).
            inputs: List of operand tensors. When ``is_square`` is false its
                entries are replaced in place by their repeated versions.
            inter_layer_precision_mode: Precision mode used for the accumulator
                size and the bias stage.
            is_square: When true the first input is multiplied by itself.

        Returns:
            The tensor returned by ``_build_bias``.
        """
        if not is_square:
            for idx, per_axis_repeats in enumerate(l_desc.input_repeats):
                # Axis 0 is skipped: repeats start at axis 1.
                for axis, count in enumerate(per_axis_repeats, start=1):
                    inputs[idx] = tf.repeat(inputs[idx], count, axis=axis)

        zp_inputs = tf.Variable(initial_value=0, dtype=tf.int32, trainable=False, name="zero_point_in_0")
        zp_weights = tf.Variable(initial_value=0, dtype=tf.int32, trainable=False, name="zero_point_in_1")

        lhs = inputs[0]
        rhs = inputs[0] if is_square else inputs[1]
        group_count = l_desc._reduce_sum_groups
        group_width = l_desc.input_features // group_count

        # move all pixels and groups to batch
        flat_shape = [-1, group_width]
        lhs = tf.cast(tf.reshape(lhs, flat_shape), dtype=tf.int32)
        rhs = tf.cast(tf.reshape(rhs, flat_shape), dtype=tf.int32)

        # add zps
        lhs = self._ew_mult_add_zp_to_tensor(lhs, -zp_inputs, True)
        rhs = self._ew_mult_add_zp_to_tensor(rhs, -zp_weights, False)

        # reshape to matrices: a row vector times a column vector per batch entry
        vector_shape = [-1, 1, group_width * 3]
        lhs = tf.reshape(lhs, vector_shape)
        rhs = tf.transpose(tf.reshape(rhs, vector_shape), perm=[0, 2, 1])

        product = self.hsim.h_matmul(
            lhs,
            rhs,
            self.emulation_mult_shift,
            accumulator_size=self._get_accumulator_size(inter_layer_precision_mode, True),
            use_fp16_acc=False,
            name="ew_mult_on_mac",
        )

        product = tf.reshape(product, [-1, l_desc.input_height, l_desc.input_width, group_count])
        return self._build_bias(l_desc, product, inter_layer_precision_mode)

    def _build_ew_mult(self, l_desc, inputs):
        """Build an element-wise multiplication of the first two inputs.

        The entries of ``inputs`` are first replaced in place by versions
        repeated according to ``l_desc.input_repeats``. For the numeric and
        partial-numeric targets the operands are prepared by ``_get_ew_inputs``
        and the product is rounded by ``_ew_mult_bankers_rounding``; for any
        other target a plain ``tf.multiply`` is returned.
        """
        for idx, per_axis_repeats in enumerate(l_desc.input_repeats):
            # Axis 0 is skipped: repeats start at axis 1.
            for axis, count in enumerate(per_axis_repeats, start=1):
                inputs[idx] = tf.repeat(inputs[idx], count, axis=axis)

        numeric_targets = (
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,
        )
        if self.target not in numeric_targets:
            return tf.multiply(inputs[0], inputs[1])

        input_a, input_b = self._get_ew_inputs(inputs, l_desc)
        return self._ew_mult_bankers_rounding(tf.multiply(input_a, input_b))

    def build_reduce_mean_or_sum_layer(self, l_desc, inp, inter_layer_precision_mode, tf_reduce_op):
        """Build a reduce (mean or sum) layer for the current emulation target.

        Args:
            l_desc: Layer description (``kernel_shape``, ``reduce_axes`` and
                ``groups`` are read).
            inp: Input tensor.
            inter_layer_precision_mode: Forwarded to ``_build_conv`` on the
                numeric path with a 4-D input.
            tf_reduce_op: Reduction callable accepting ``input_tensor``,
                ``axis``, ``keepdims`` and optionally ``name``.

        Returns:
            For numeric targets with a 4-D input, whatever ``_build_conv``
            returns; otherwise a tuple ``(op, kernel_variable)``.
        """
        is_16bit_precision_mode = self._is_16bit_input(l_desc)
        kernel_shape = l_desc.kernel_shape

        if self.target in [EmulationInferenceTargets.SDK_NUMERIC, EmulationInferenceTargets.SDK_PARTIAL_NUMERIC]:
            if len(inp.shape) == 4:
                return self._build_conv(l_desc, [inp], inter_layer_precision_mode, None)

            k = tf.Variable(tf.constant(4, shape=kernel_shape, dtype=inp.dtype), name="kernel")
            scaled = tf.multiply(inp, k[0, 0, 0, 0], name="op")
            # The division by the mult shift is skipped for int32 numeric layers.
            if not self._is_int32_numeric(l_desc, is_16bit_precision_mode):
                scaled = tf.divide(scaled, tf.cast(self.emulation_mult_shift, inp.dtype))
            scaled = tf.cast(scaled, inp.dtype)
            return tf_reduce_op(input_tensor=scaled, axis=l_desc.reduce_axes, keepdims=True), k

        if l_desc.groups > 1:
            group_width = int(inp.shape[-1] / l_desc.groups)
            # Reduce every feature group on its own, then stitch them back on axis 3.
            per_group = [
                tf_reduce_op(
                    input_tensor=inp[:, :, :, g * group_width : (g + 1) * group_width],
                    axis=l_desc.reduce_axes,
                    keepdims=True,
                    name=f"reduce_op_{g}",
                )
                for g in range(l_desc.groups)
            ]
            op = tf.concat(per_group, 3, name="op")
        else:
            op = tf_reduce_op(input_tensor=inp, axis=l_desc.reduce_axes, keepdims=True, name="op")

        k = tf.Variable(tf.constant(4, shape=kernel_shape, dtype=tf.float32), name="kernel")
        return op, k

    def _get_dilation(self, l_desc):
        if not hasattr(l_desc, "dilations"):
            return [1, 1, 1, 1]
        if l_desc.op not in [LayerType.conv, LayerType.dw]:
            if l_desc.dilations[1] > 1 or l_desc.dilations[2] > 1:
                raise BackendEmulatorException(f"dilation is not supported in layer type {l_desc.op}")
        return l_desc.dilations

    def _get_strides(self, l_desc):
        if not hasattr(l_desc, "strides"):
            return [1, 1, 1, 1]
        return [l_desc.strides[0], l_desc.strides[1], l_desc.strides[2], l_desc.strides[3]]

    def _bankers_round_int(self, inp, beta, int64_int32_n=False):
        """Right-shift ``inp`` by ``beta`` bits with round-half-to-even.

        Args:
            inp: Integer tensor to shift.
            beta: Number of bits to shift out.
            int64_int32_n: When true the round-up flag is cast to int64,
                otherwise to int32.

        Returns:
            Tuple ``(rounded, truncated)`` where ``truncated`` is the plain right
            shift and ``rounded`` adds the round-up correction.
        """
        flag_dtype = tf.int64 if int64_int32_n else tf.int32

        truncated = tf.bitwise.right_shift(inp, beta)
        half_bit = tf.bitwise.bitwise_and(tf.bitwise.right_shift(inp, beta - 1), 1)
        # Bits 0..beta without the half bit: the bits below the half bit plus the
        # lowest bit that survives the shift (which decides ties).
        tie_break_mask = 2 ** (beta + 1) - 1 - 2 ** (beta - 1)
        tie_break_bits = tf.bitwise.bitwise_and(inp, tie_break_mask)
        any_tie_break_bit = tf.cast(tf.math.greater(tie_break_bits, 0), flag_dtype)
        round_up = tf.bitwise.bitwise_and(any_tie_break_bit, half_bit)
        return truncated + round_up, truncated

    def _shift_and_bankers_round(self, inp, shift_no, b_round_bits_no, is_16bit_precision_mode=False):
        """Shift ``inp`` right by ``shift_no`` bits, rounding the last bits to even.

        The first ``shift_no - b_round_bits_no`` bits are dropped by a plain
        shift; the remaining ``b_round_bits_no`` bits are removed by
        ``_bankers_round_int``.

        Returns:
            The rounded tensor (first element returned by ``_bankers_round_int``).
        """
        pre_shift = shift_no - b_round_bits_no
        if is_16bit_precision_mode or self.is_mercury_arch:
            # B0-1717 work-around pre bankers rounding calculation
            # (input * mantissa) / 2 ** (shift - b_rounds_bits_no) for negative values do
            # floor instead of ceil as in A0 and B0 8 bit precision
            shifted = tf.bitwise.right_shift(inp, pre_shift)
        else:
            # Shift the magnitude and restore the sign afterwards.
            shifted = tf.sign(inp) * tf.bitwise.right_shift(tf.abs(inp), pre_shift)
        rounded, _ = self._bankers_round_int(shifted, b_round_bits_no, is_16bit_precision_mode)
        return rounded

    def _build_piecewise_activation(self, inp, activation, l_desc, inter_layer_precision_mode):
        """Build the piecewise activation through ``self.hsim.h_activation``.

        Creates the non-trainable ``x_points``, ``offsets``, ``slopes_m`` and
        ``slopes_e`` variables (one row per quantization group) inside the
        ``piecewise`` variable scope and hands them, cast to int64, to the
        simulator op.

        Args:
            inp: Input tensor; its name without the last path component is used
                to look up the number of activation points.
            activation: Not used by this method.
            l_desc: Layer description (``op`` and ``precision_config`` are read).
            inter_layer_precision_mode: Supplies ``ebias``, ``beta``,
                ``shifter_bias_max_value`` and the ``is_mode`` check.

        Returns:
            The result of ``self.hsim.h_activation``.
        """
        with tf.compat.v1.variable_scope("piecewise"):
            is_16bit_precision_mode = (
                self._is_16bit_input(l_desc)
                or inter_layer_precision_mode.is_mode(16, 16)  # this is 8->16 mode
                or l_desc.op == LayerType.fused_bbox_decoder
            )
            layer_name = inp.name.rpartition("/")[0]
            num_points = 2 if self.activation_points is None else self.activation_points[layer_name]

            dtype_precision = tf.int32
            dtype_precision_slope = tf.int64 if is_16bit_precision_mode else tf.int32
            quantization_groups = l_desc.precision_config.quantization_groups

            def zeros_variable(name, columns, dtype):
                # One row per quantization group, initialised to zero.
                return tf.Variable(
                    tf.zeros([quantization_groups, columns], dtype=dtype),
                    dtype=dtype,
                    trainable=False,
                    name=name,
                )

            all_pa_x_points = zeros_variable("x_points", num_points, dtype_precision_slope)
            all_pa_offsets = zeros_variable("offsets", num_points + 1, dtype_precision)
            all_pa_slopes_m = zeros_variable("slopes_m", num_points + 1, dtype_precision_slope)
            all_pa_slopes_e = zeros_variable("slopes_e", num_points + 1, dtype_precision_slope)

            split_spec = self._get_const("size_splits", default_value=np.int32(1))
            if isinstance(split_spec, np.int32):
                split_spec = int(split_spec)
            else:
                split_spec = tf.cast(split_spec, dtype=tf.int32)

            # Width (last dimension) of every part the input is split into.
            part_widths = [tf.shape(part)[-1] for part in tf.split(inp, split_spec, axis=-1, name="inp_parts")]

            precision_mode = l_desc.precision_config.precision_mode
            is_8bit_to_16bit_mode = (
                precision_mode.input_precision_mode() == PrecisionMode.a8_w8_a8
                and l_desc.precision_config.precision_mode.output_precision_mode() == PrecisionMode.a16_w16_a16
            )

            # None to bool
            is_signed_output = bool(l_desc.precision_config.signed_output)
            return self.hsim.h_activation(
                tf.cast(inp, np.int64),
                tf.cast(all_pa_x_points, np.int64),
                tf.cast(all_pa_offsets, np.int64),
                tf.cast(all_pa_slopes_m, np.int64),
                tf.cast(all_pa_slopes_e, np.int64),
                size_splits=tf.cast(part_widths, np.int64),
                is_8bit_to_16bit_mode=is_8bit_to_16bit_mode,
                is_mercury=self.is_mercury_arch,
                ebias=inter_layer_precision_mode.ebias,
                clip_bits=inter_layer_precision_mode.shifter_bias_max_value,
                beta=inter_layer_precision_mode.beta,
                is_16bit_precision_mode=is_16bit_precision_mode,
                is_signed_output=is_signed_output,
            )

    def _build_output_stage(self, inp, l_desc, inter_layer_precision_mode):
        """Apply the activation, round, collect clipping statistics and clip.

        Args:
            inp: Tensor entering the output stage.
            l_desc: Layer description (``activation`` and
                ``precision_config.signed_output`` are read).
            inter_layer_precision_mode: Supplies ``beta`` and
                ``output_activation_bits``.

        Returns:
            Tuple ``(out, out_pre_round, stats)``: the clipped output (cast to
            int32 when the output bit size is 15, float32 otherwise), the
            truncated value before rounding, and the two clipping-ratio tensors
            ``[min, max]`` computed before clipping.
        """
        with tf.compat.v1.variable_scope("output_stage"):
            beta = inter_layer_precision_mode.beta
            activated = self._build_activation(l_desc.activation, inp, l_desc, inter_layer_precision_mode)
            out, out_pre_round = self._bankers_round_int(activated, beta)

            out_bit_size = inter_layer_precision_mode.output_activation_bits

            def fraction_of(mask, name):
                # Share of elements for which ``mask`` holds.
                return tf.divide(
                    x=tf.math.count_nonzero(mask),
                    y=tf.cast(tf.size(input=out), tf.int64),
                    name=name,
                )

            stats = [
                fraction_of(tf.less(out, 0), "stats_out_clipped_values_min"),
                fraction_of(tf.greater_equal(out, 2**out_bit_size), "stats_out_clipped_values_max"),
            ]

            if l_desc.precision_config.signed_output:
                # Signed outputs are clipped to a symmetric range.
                max_val = (2 ** (out_bit_size - 1)) - 1
                out = tf.clip_by_value(out, -max_val, max_val)
            else:
                out = tf.clip_by_value(out, 0, 2**out_bit_size - 1)

            out = tf.cast(out, tf.int32 if out_bit_size == 15 else tf.float32)
            return out, out_pre_round, stats

    def get_rescaled_output_layers(self, params, debug_rescale_outputs=False):
        """Add ``rescaled_out`` tensors that convert layer outputs using ``qp_out``.

        For every processed layer tensor the result is
        ``scale * (tensor - zero_point)``; argmax layers are only multiplied by
        a float32 ones tensor.

        Args:
            params: Only inspected with ``hasattr(params, layer_name)`` in order
                to log a warning when the layer is missing.
            debug_rescale_outputs: When true, all layers returned by
                ``get_sorted_inner_layers()`` are processed; otherwise only the
                output layers are.

        Returns:
            Tuple ``(out_layers_results, inner_results)``. ``out_layers_results``
            has one slot per real output layer (``None`` where nothing was
            stored); ``inner_results`` holds every rescaled tensor in processing
            order.
        """
        inner_results = []
        # NOTE(review): the list returned by get_real_output_layers() is extended in
        # place below; if that method hands out an internal list rather than a copy,
        # this also changes hailo_nn's state - confirm.
        out_layers = self.hailo_nn.get_real_output_layers()
        out_layers_results = [None] * len(out_layers)
        out_layers.extend(self.hailo_nn.get_output_layers())
        # NOTE(review): when debug_rescale_outputs is False, `layers` is the very same
        # list object as `out_layers`, so the `out_layers[index] = None` assignment
        # below modifies the list that is being iterated.
        layers = self.get_sorted_inner_layers() if debug_rescale_outputs else out_layers
        with self.g.as_default():
            for output_layer in layers:
                # 1. original_name_scope includes the slash
                # 2. in general, final slash is needed to re-enter a scope,
                #    see: https://github.com/tensorflow/tensorflow/issues/3499
                output_layer_tensors = self.get_tensors([output_layer])
                real_output_layer_tensors = self.get_tensors([output_layer], True)
                scope = self._get_scope_from_the_name_of_tensor_or_layer(output_layer)
                qp_out = self._get_variable_with_scope(
                    variable_name="qp_out",
                    layer_name=output_layer.name_without_scope,
                    scope=scope,
                )
                # qp_out is indexed as [zero_point, scale].
                qp_scale = qp_out[1]
                qp_zp = qp_out[0]
                if output_layer in out_layers and self.hailo_nn.is_output_scale_per_channel():
                    # Per-channel output scaling: use one scale / zero point per output feature.
                    shape = (1, output_layer.output_features)
                    qp_scale = self._get_variable_with_scope(
                        variable_name="layer_params/output_scales",
                        layer_name=output_layer.name_without_scope,
                        scope=scope,
                        shape=shape,
                    )
                    qp_zp = self._get_variable_with_scope(
                        variable_name="layer_params/output_zero_points",
                        layer_name=output_layer.name_without_scope,
                        scope=scope,
                        shape=shape,
                    )

                for output_layer_tensor in output_layer_tensors:
                    # The first two path components of the tensor name are used as the scope to re-enter.
                    layer_name = "/".join(output_layer_tensor.name.split("/")[:2])
                    with tf.compat.v1.variable_scope(layer_name + "/"):
                        add_as_output = output_layer_tensor in real_output_layer_tensors and output_layer in out_layers
                        if not hasattr(params, layer_name):
                            self._logger.warning(f"layer {layer_name} doesn't have qp_out")
                        if output_layer.op == LayerType.argmax:
                            # Argmax outputs are not rescaled, only multiplied by a float32 one.
                            res = tf.multiply(
                                tf.ones(shape=[1], dtype=tf.float32),
                                output_layer_tensor,
                                name="rescaled_out",
                            )
                        else:
                            if output_layer_tensor.dtype != tf.float32:
                                output_layer_tensor = tf.cast(output_layer_tensor, tf.float32)
                            res = tf.multiply(qp_scale, tf.subtract(output_layer_tensor, qp_zp), name="rescaled_out")
                        # To avoid duplications, add the output tensor only if it is a real output
                        if add_as_output:
                            # Preserve output layers order
                            # NOTE(review): index is looked up in the extended list while
                            # out_layers_results only has one slot per real output layer;
                            # presumably only real output layers reach this branch - confirm.
                            index = out_layers.index(output_layer)
                            out_layers_results[index] = res
                            # Prevent reuse
                            out_layers[index] = None
                        inner_results.append(res)

        return out_layers_results, inner_results

    def _to_numeric(self, inp, qp_in, limvals_in):
        """Switch the emulator to the configured numeric target and quantize ``inp``.

        Args:
            inp: Tensor to convert.
            qp_in: Indexable pair used as ``[zero_point, scale]``.
            limvals_in: Indexable pair ``[min, max]`` used to clip ``inp`` first.

        Returns:
            ``rint(clipped / qp_in[1] + qp_in[0])``.

        Raises:
            BackendEmulatorException: If ``self.mixed_params.numeric_target`` is
                neither ``SDK_NUMERIC`` nor ``SDK_PARTIAL_NUMERIC``.
        """
        supported_targets = (
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,
        )
        for numeric_target in supported_targets:
            if self.mixed_params.numeric_target == numeric_target:
                self.target = numeric_target
                break
        else:
            raise BackendEmulatorException(
                f"{self.mixed_params.numeric_target} is not supported with the SdkMixed target",
            )

        clipped = tf.clip_by_value(inp, limvals_in[0], limvals_in[1])
        scaled = tf.divide(clipped, qp_in[1])
        return tf.math.rint(tf.add(scaled, qp_in[0]))

    def _get_variable_with_scope(self, variable_name, layer_name=None, scope=None, shape=2):
        """Get (or create) a non-trainable float32 variable under ``scope/layer_name``.

        The layer scope is entered with ``AUTO_REUSE`` and the variable is
        initialised with zeros of the given ``shape``.
        """
        with tf.compat.v1.variable_scope(scope), tf.compat.v1.variable_scope(
            layer_name,
            reuse=tf.compat.v1.AUTO_REUSE,
        ):
            zeros = tf.zeros(shape)
            return tf.compat.v1.get_variable(
                variable_name,
                dtype=tf.float32,
                trainable=False,
                initializer=zeros,
            )

    def _from_numeric(self, inp):
        """Switch the emulator to the native target and de-quantize ``inp``.

        Uses the ``qp_out`` variable of the current variable scope, indexed as
        ``[zero_point, scale]``, and returns ``scale * (inp - zero_point)``.
        """
        self.target = EmulationInferenceTargets.SDK_NATIVE
        qp_out = tf.compat.v1.get_variable("qp_out", dtype=tf.float32, trainable=False, initializer=tf.zeros(2))
        as_float = tf.cast(inp, tf.float32)
        scale = qp_out[1]
        zero_point = qp_out[0]
        return tf.multiply(scale, tf.subtract(as_float, zero_point))

    def prepare_full_hailo_graph_export(self, params=None, should_rescale=False, requires_quantized_weights=False):
        """Assemble a ``HailoGraphExport`` with all tensor and variable exports.

        Args:
            params: Passed to ``get_rescaled_output_layers``; required when
                ``should_rescale`` is true.
            should_rescale: When true, rescaled output and inner-layer tensors
                are built and exported.
            requires_quantized_weights: When false (and ``should_rescale`` is
                false) on a numeric target, the non-rescaled tensors are also
                registered under the rescaled export levels.

        Returns:
            The populated ``HailoGraphExport``.

        Raises:
            BackendEmulatorException: If ``should_rescale`` is true and
                ``params`` is ``None``.
        """

        def load_params_func(session, graph):
            return self.load_params()

        hailo_graph_export = HailoGraphExport(
            session=self.s,
            graph=self.g,
            input_tensors=self.get_input_layers(),
            load_params_func=load_params_func,
        )
        add_tensors = hailo_graph_export.add_output_tensors_export
        add_variables = hailo_graph_export.add_variables_export

        def export_tensors(level, tensors, names):
            # Build one tensors export, register it and hand it back to the caller.
            exported = OutputTensorsExport(export_level=level, tensors=tensors, layers_names=names)
            add_tensors(exported)
            return exported

        output_layers_names = self.get_output_layers_names()
        all_layers_names = self.get_inner_layers_names()

        # Prepare and add output layers and inner layers exports
        output_layers_export = export_tensors(
            ExportLevel.OUTPUT_LAYERS,
            self.get_output_layers(),
            output_layers_names,
        )
        all_layers_export = export_tensors(
            ExportLevel.ALL_LAYERS,
            self.get_inner_layers_output_tensors(),
            all_layers_names,
        )

        # Prepare and add rescaled output layers and inner layers exports
        if should_rescale:
            if params is None:
                raise BackendEmulatorException(
                    "Failed to create emulation graph with rescaled layers, params not found",
                )
            output_layers_rescaled, all_layers_rescaled = self.get_rescaled_output_layers(params, True)
            export_tensors(ExportLevel.OUTPUT_LAYERS_RESCALED, output_layers_rescaled, output_layers_names)
            export_tensors(ExportLevel.ALL_LAYERS_RESCALED, all_layers_rescaled, all_layers_names)
        elif not requires_quantized_weights and self.target in [
            EmulationInferenceTargets.SDK_NUMERIC,
            EmulationInferenceTargets.SDK_PARTIAL_NUMERIC,
            EmulationInferenceTargets.SDK_DEBUG_PRECISE_NUMERIC,
        ]:
            # No rescaling requested: the plain tensors are registered under the rescaled levels.
            export_tensors(ExportLevel.OUTPUT_LAYERS_RESCALED, output_layers_export.tensors, output_layers_names)
            export_tensors(ExportLevel.ALL_LAYERS_RESCALED, all_layers_export.tensors, all_layers_names)

        # Prepare finetune tensors
        add_tensors(self.prepare_ft_output_export())

        # Prepare and add statistics tensors
        add_tensors(self.prepare_stats_export())
        add_tensors(self.prepare_activations_histograms_export())

        # Prepare variables
        add_variables(self.prepare_bias_export())
        add_variables(self.prepare_bias_delta_export())
        add_variables(self.prepare_kernel_variables_export())
        add_variables(self.prepare_kernel_delta_variables_export())
        add_variables(self.prepare_variables_to_initialize_export())

        add_tensors(self.prepare_ft_kernel_range_tensors_export())
        add_tensors(self.prepare_ft_alpha_tensors_export())
        add_tensors(self.prepare_ft_final_kernel_tensors_export())
        add_tensors(self.prepare_ft_kernel_frac_part_tensors_export())

        # Prepare and add full graph (operation) exports
        full_export, pre_act_export = self.prepare_full_graph_export()
        add_tensors(pre_act_export)
        add_tensors(full_export)
        return hailo_graph_export
