Why is there a "neural engine-data copy" in Core ML NPU prediction?

I am currently facing a performance issue while using Core ML on iOS 16+ devices to run a simple grid_sample model. When profiling the model in Xcode Instruments, I noticed that before each NPU computation there is a significant delay caused by the "input copy" and "neural engine-data copy" operations. I have specified that both the inputs and the output of the model are float16, so there shouldn't be any data type conversion.

I would appreciate any insights or suggestions regarding the cause of this delay and possible solutions.
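To rule out a hidden dtype mismatch, the declared input/output types of the saved package can be inspected from Python (a minimal check, assuming the grid_sample.mlpackage produced by the conversion code below):

import coremltools as ct

# Print the model description; both inputs and the output should report
# multiArrayType with dataType FLOAT16 if the conversion worked as intended.
spec = ct.models.MLModel("./grid_sample.mlpackage").get_spec()
print(spec.description.input)
print(spec.description.output)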

My simple model is:

import torch
import torch.nn.functional as F


class GridSample(torch.nn.Module):
    def forward(self, input: torch.Tensor, grid: torch.Tensor) -> torch.Tensor:
        # Nearest-neighbor sampling; the grid is cast to the input's dtype and device.
        return F.grid_sample(
            input, grid.to(input), mode='nearest', padding_mode='zeros', align_corners=True,
        )


tr_input = torch.randn(8, 64, 512, 512)
tr_grid = torch.randn(8, 256, 256, 2)
simple_model = GridSample()
simple_model.eval()
traced_model = torch.jit.trace(simple_model, [tr_input, tr_grid])

import os

import coremltools as ct
import numpy as np

coreml_inputs = [
    ct.TensorType(name="image_input", shape=tr_input.shape, dtype=np.float16),
    ct.TensorType(name="warp_grid", shape=tr_grid.shape, dtype=np.float16),
]
mlmodel = ct.convert(
    traced_model,
    inputs=coreml_inputs,
    outputs=[ct.TensorType(name="x0", dtype=np.float16)],
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.iOS16,
    compute_units=ct.ComputeUnit.ALL,
    compute_precision=ct.precision.FLOAT16,
)
mlmodel.save("./grid_sample.mlpackage")
os.system("xcrun coremlcompiler compile './grid_sample.mlpackage' './'")