MindSpore CPU Native Builder

Build native CPU kernels directly in the MindSpore repository, with no dependency on ATen/libtorch.

When to Use

Use this skill when:

• Implementing kernels in mindspore/ccsrc/plugin/device/cpu/kernel/
• Writing optimized kernels with Eigen or SLEEF
• No external ATen/libtorch dependency is desired
• Full control over implementation is needed
• Building production-optimized kernels

Quick Start

• Identify the operator from mindspore/ops/op_def/yaml/
• Create kernel class in mindspore/ccsrc/plugin/device/cpu/kernel/
• Inherit from CPUKernelMod or NativeCpuKernelMod
• Register with MS_KERNEL_FACTORY_REG
• Add tests in tests/st/ops/cpu/

Instructions

Step 1: Identify the Operator

Find the operator definition:

bash

# Search for the operator's YAML definition; substitute the operator name for <op_name>.
# The pattern is quoted so the shell passes the wildcards to find instead of expanding them itself.
find mindspore/ops/op_def/yaml -name "*<op_name>*"

Step 2: Create Kernel File

Create mindspore/ccsrc/plugin/device/cpu/kernel/<op_name>_cpu_kernel.cc:

cpp

/**
 * Copyright 2025 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * ...
 */

#include "plugin/device/cpu/kernel/<op_name>_cpu_kernel.h"
#include <algorithm>
#include <functional>
#include "plugin/device/cpu/hal/device/cpu_device_address.h"

namespace mindspore {
namespace kernel {

// Setup hook of the kernel skeleton. Kernel parameters that should be cached
// are read here; this template has none, so it only reports success.
bool OpNameCpuKernelMod::Init(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &outputs) {
  // Nothing to initialize yet.
  return true;
}

// Shape-dependent setup for the kernel.
//
// Returns KRET_OK when the kernel is ready to launch; otherwise the status
// reported by the base-class Resize is passed through unchanged.
int OpNameCpuKernelMod::Resize(const std::vector<KernelTensor *> &inputs,
                               const std::vector<KernelTensor *> &outputs) {
  // Run the base implementation first so its shape-derived bookkeeping is
  // refreshed, and stop early if it reports that the shapes are not usable.
  const int ret = KernelMod::Resize(inputs, outputs);
  if (ret != KRET_OK) {
    return ret;
  }

  // Handle dynamic shapes: recompute any shape-dependent members here.
  return KRET_OK;
}

bool OpNameCpuKernelMod::Launch(const std::vector<kernel::KernelTensor *> &inputs,
                                 const std::vector<kernel::KernelTensor *> &workspace,
                                 const std::vector<kernel::KernelTensor *> &outputs) {
  // Get input/output pointers
  auto input = reinterpret_cast<float *>(inputs[0]->device_ptr());
  auto output = reinterpret_cast<float *>(outputs[0]->device_ptr());

  // Implement kernel logic
  size_t elem_num = inputs[0]->size() / sizeof(float);
  for (size_t i = 0; i < elem_num; ++i) {
    output[i] = /* computation */;
  }

  return true;
}

// Registers OpNameCpuKernelMod in the NativeCpuKernelMod factory under the name OpName.
// NOTE(review): presumably the name must match the operator name used by the framework; verify.
MS_KERNEL_FACTORY_REG(NativeCpuKernelMod, OpName, OpNameCpuKernelMod);

}  // namespace kernel
}  // namespace mindspore

Step 3: Create Header File

Create mindspore/ccsrc/plugin/device/cpu/kernel/<op_name>_cpu_kernel.h:

cpp

/**
 * Copyright 2025 Huawei Technologies Co., Ltd
 * ...
 */

#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_OPNAME_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_OPNAME_CPU_KERNEL_H_

#include <vector>
#include <memory>
#include "plugin/device/cpu/kernel/cpu_kernel.h"
#include "plugin/factory/ms_factory.h"

namespace mindspore {
namespace kernel {

// Native CPU kernel skeleton for the operator OpName.
//
// Init, Resize and Launch are declared here and defined in the matching .cc file.
class OpNameCpuKernelMod : public NativeCpuKernelMod {
 public:
  OpNameCpuKernelMod() = default;
  ~OpNameCpuKernelMod() override = default;

  // Setup hook; returns true on success.
  bool Init(const std::vector<KernelTensor *> &inputs,
            const std::vector<KernelTensor *> &outputs) override;

  // Shape-dependent setup; returns KRET_OK on success.
  int Resize(const std::vector<KernelTensor *> &inputs,
             const std::vector<KernelTensor *> &outputs) override;

  // Runs the computation; returns true on success.
  bool Launch(const std::vector<KernelTensor *> &inputs,
              const std::vector<KernelTensor *> &workspace,
              const std::vector<KernelTensor *> &outputs) override;

 protected:
  // Lists the dtype combinations this kernel accepts.
  //
  // Only float32 is listed because the Launch implementation reads and writes
  // its buffers as float. Add an entry (e.g. kNumberTypeFloat64) only together
  // with a Launch path that handles that element type; otherwise the data
  // would be reinterpreted with the wrong width.
  std::vector<KernelAttr> GetOpSupport() override {
    static const std::vector<KernelAttr> support_list = {
      KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
    };
    return support_list;
  }

 private:
  // Kernel parameters
};

}  // namespace kernel
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_OPNAME_CPU_KERNEL_H_

Step 4: Using Eigen for Optimization

For vectorized operations, use Eigen:

cpp

#include "Eigen/Core"

bool OpNameCpuKernelMod::Launch(...) {
  auto input = reinterpret_cast<float *>(inputs[0]->device_ptr());
  auto output = reinterpret_cast<float *>(outputs[0]->device_ptr());
  size_t elem_num = inputs[0]->size() / sizeof(float);

  // Eigen map for vectorized operations
  Eigen::Map<Eigen::ArrayXf> input_array(input, elem_num);
  Eigen::Map<Eigen::ArrayXf> output_array(output, elem_num);

  // Vectorized computation
  output_array = input_array.sin();  // Example: element-wise sin

  return true;
}

Step 5: Using SLEEF for SIMD

For explicit SIMD with SLEEF:

cpp

#include "sleef.h"

bool OpNameCpuKernelMod::Launch(...) {
  auto input = reinterpret_cast<float *>(inputs[0]->device_ptr());
  auto output = reinterpret_cast<float *>(outputs[0]->device_ptr());
  size_t elem_num = inputs[0]->size() / sizeof(float);

  // Process 8 floats at a time with AVX
  size_t i = 0;
  for (; i + 8 <= elem_num; i += 8) {
    __m256 x = _mm256_loadu_ps(input + i);
    __m256 y = Sleef_sinf8_u10(x);  // SLEEF vectorized sin
    _mm256_storeu_ps(output + i, y);
  }

  // Handle remainder
  for (; i < elem_num; ++i) {
    output[i] = sinf(input[i]);
  }

  return true;
}

Step 6: Multi-threaded Execution

Use OpenMP for parallelization:

cpp

#include <omp.h>

bool OpNameCpuKernelMod::Launch(...) {
  auto input = reinterpret_cast<float *>(inputs[0]->device_ptr());
  auto output = reinterpret_cast<float *>(outputs[0]->device_ptr());
  size_t elem_num = inputs[0]->size() / sizeof(float);

  #pragma omp parallel for
  for (size_t i = 0; i < elem_num; ++i) {
    output[i] = /* computation */;
  }

  return true;
}

Step 7: Write Tests

Create tests/st/ops/cpu/test_<op_name>_op.py:

python

import pytest
import numpy as np
import mindspore as ms
from mindspore import ops, Tensor
import torch


@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_op_name_float32():
    """
    Feature: op_name on the CPU backend.
    Description: run op_name on a float32 tensor of shape (3, 4) in PyNative mode.
    Expectation: the result matches the PyTorch reference within rtol=1e-5, atol=1e-8.
    """
    ms.set_context(mode=ms.PYNATIVE_MODE, device_target="CPU")

    # Fixed seed so a failure can be reproduced with exactly the same input.
    np.random.seed(0)
    input_np = np.random.randn(3, 4).astype(np.float32)

    # MindSpore
    ms_input = Tensor(input_np)
    ms_output = ops.op_name(ms_input)

    # PyTorch reference
    torch_input = torch.from_numpy(input_np)
    torch_output = torch.op_name(torch_input)

    # Compare; assert_allclose reports the mismatching elements on failure.
    np.testing.assert_allclose(ms_output.asnumpy(), torch_output.numpy(), rtol=1e-5, atol=1e-8)

Step 8: Build and Test

bash

cd mindspore
bash build.sh -e cpu -j 16

# Install the freshly built wheel so the tests import the build that contains the new kernel
pip install --force-reinstall output/mindspore-*.whl

# Run tests
pytest tests/st/ops/cpu/test_<op_name>_op.py -v

Key Locations

Component	Location
Kernels	`mindspore/ccsrc/plugin/device/cpu/kernel/`
Base class	`mindspore/ccsrc/plugin/device/cpu/kernel/cpu_kernel.h`
Eigen	`third_party/eigen/`
Tests	`tests/st/ops/cpu/`

Test Coverage Checklist

• All supported dtypes (float16, float32, float64, int32, int64)
• Various tensor shapes (0D to 8D)
• Empty tensors
• Non-contiguous tensors
• Large tensors (stress test)
• Numerical accuracy vs PyTorch
• Edge cases (NaN, Inf, zeros)

Performance Tips

• Use Eigen for automatic vectorization
• Use SLEEF for explicit SIMD control
• Use OpenMP for multi-threading
• Minimize memory allocation in Launch()
• Preallocate workspace in Resize()
• Cache kernel parameters in Init()

Troubleshooting

Kernel not found: Check MS_KERNEL_FACTORY_REG registration

Type mismatch: Verify GetOpSupport() includes all needed types

Performance issues: Profile with perf or Intel VTune

References

• mindspore/ccsrc/plugin/device/cpu/kernel/ - Existing kernel implementations
• third_party/eigen/ - Eigen documentation
• MindSpore CPU kernel development guide (internal docs)