2025-03-26
HAMi (Heterogeneous AI Computing Virtualization Middleware) is a middleware solution for managing heterogeneous devices in Kubernetes clusters. It provides a comprehensive framework for managing and scheduling accelerators like GPUs and FPGAs in Kubernetes environments. For a detailed introduction to HAMi's capabilities and features, see our Project Overview and Why Kubernetes Can't Meet AI Scheduling Needs.
Core components of HAMi codebase:
cmd/
├── device-plugin/ # Device plugin components
│ ├── main.go # Entry point
│ └── nvidia/ # NVIDIA device plugin implementation
│
├── scheduler/ # Scheduler components
│ ├── main.go # Scheduler entry point
│ └── metrics.go # Monitoring metrics implementation
│
└── vGPUmonitor/ # GPU monitoring components
  ├── main.go # Monitoring service entry
  ├── metrics.go # Metrics definition and collection
  ├── feedback.go # Feedback mechanism
  ├── validation.go # Validation logic
  ├── build.sh # Build script
  ├── noderpc/ # Node RPC communication
  └── testcollector/ # Test collector
pkg/
├── device/ # Core device management
│ ├── devices.go # Unified device interface
│ ├── devices_test.go # Device interface tests
│ ├── nvidia/ # NVIDIA GPU implementation
│ ├── cambricon/ # Cambricon device implementation
│ ├── ascend/ # Huawei Ascend implementation
│ ├── hygon/ # Hygon device implementation
│ ├── iluvatar/ # Iluvatar CoreX implementation
│ ├── metax/ # MetaX device implementation
│ └── mthreads/ # Moore Threads implementation
│
├── scheduler/ # Scheduling system
│ ├── config/ # Configuration management
│ ├── policy/ # Scheduling policies
│ ├── routes/ # Routing rules
│ ├── scheduler.go # Scheduler core
│ ├── nodes.go # Node management
│ ├── pods.go # Pod management
│ ├── score.go # Scoring system
│ ├── event.go # Event handling
│ ├── webhook.go # Webhook integration
│ └── *_test.go # Corresponding test files
│
├── device-plugin/ # Device plugin implementation
├── k8sutil/ # Kubernetes utilities
├── monitor/ # Monitoring system
├── oci/ # OCI runtime integration
├── util/ # Common utilities
└── version/ # Version management
pkg/device
Our device management system supports various hardware vendors including Cambricon, Iluvatar CoreX, and others through a unified interface. For detailed compatibility information, check our GPU Virtualization Technology Guide.
// pkg/device/devices.go
// Devices is the device interface declared in pkg/device/devices.go, which the
// surrounding overview describes as the unified interface through which the
// supported hardware vendors are handled.
//
// NOTE(review): the method comments below restate the signatures only; the
// behavior behind each method is not shown in this excerpt.
type Devices interface {
// CommonWord returns a string; what the string denotes is not shown here — TODO confirm.
CommonWord() string
// GetNodeDevices takes a node (by value) and returns a slice of
// *util.DeviceInfo together with an error.
GetNodeDevices(n corev1.Node) ([]*util.DeviceInfo, error)
// CheckHealth takes a device type string and a node and returns two booleans;
// what each boolean signifies is not shown here — TODO confirm.
CheckHealth(devType string, n *corev1.Node) (bool, bool)
// GenerateResourceRequests takes a container and returns a
// util.ContainerDeviceRequest.
GenerateResourceRequests(ctr *corev1.Container) util.ContainerDeviceRequest
// LockNode takes a node and a pod and returns an error.
// presumably acquires a node-level lock on behalf of the pod — verify.
LockNode(n *corev1.Node, p *corev1.Pod) error
// ReleaseNodeLock takes a node and a pod and returns an error.
// presumably the counterpart of LockNode — verify.
ReleaseNodeLock(n *corev1.Node, p *corev1.Pod) error
// ScoreNode takes a node, a util.PodSingleDevice and a policy string and
// returns a float32 score.
ScoreNode(node *corev1.Node, podDevices util.PodSingleDevice, policy string) float32
// CustomFilterRule returns a bool. The "(...)" parameter list is an elision
// made by this overview and is not valid Go as written.
CustomFilterRule(...) bool
}
// pkg/device/nvidia/device.go
// NvidiaDevices is the NVIDIA device type shown for pkg/device/nvidia/device.go.
// A GetNodeDevices method is declared on *NvidiaDevices in this overview.
type NvidiaDevices struct {
// resourceName is an unexported string field.
// presumably the resource name used for NVIDIA devices — TODO confirm.
resourceName string
}
// GetNodeDevices takes a node and returns a slice of *util.DeviceInfo and an
// error. The body is a placeholder in this overview: the implementation is
// elided, so the function has no return statement as written.
func (dev *NvidiaDevices) GetNodeDevices(n corev1.Node) ([]*util.DeviceInfo, error) {
// NVIDIA device discovery implementation
}
// pkg/device/metax/device.go
// MetaxDevices is the MetaX device type shown for pkg/device/metax/device.go.
// It is declared with no fields in this overview.
type MetaxDevices struct {}
// String constants declared alongside MetaxDevices.
// presumably device type identifiers for the two MetaX variants; how they are
// consumed is not shown here — confirm against callers.
const (
// MetaxGPUDevice is the string "Metax".
MetaxGPUDevice = "Metax"
// MetaxSGPUDevice is the string "Metax-SGPU".
MetaxSGPUDevice = "Metax-SGPU"
)
// GetNodeDevices takes a node and returns a slice of *util.DeviceInfo and an
// error. The body is a placeholder in this overview: the implementation is
// elided, so the function has no return statement as written.
func (dev *MetaxDevices) GetNodeDevices(n corev1.Node) ([]*util.DeviceInfo, error) {
// MetaX device discovery implementation
}
pkg/device-plugin
For a detailed analysis of our device plugin implementation, see our Device Plugin Analysis and Webhook Analysis.
// pkg/device-plugin/server/server.go
// DevicePluginServer is the server type shown for
// pkg/device-plugin/server/server.go. It holds a single device plugin server
// value.
type DevicePluginServer struct {
// devicePlugin is a v1beta1.DevicePluginServer.
// NOTE(review): this field uses the v1beta1 package alias while ListAndWatch
// uses pluginapi; presumably both refer to the same device plugin API
// package — confirm.
devicePlugin v1beta1.DevicePluginServer
}
// ListAndWatch takes an empty request and a ListAndWatch server stream and
// returns an error. The body is a placeholder in this overview: the
// implementation is elided, so the function has no return statement as written.
func (s *DevicePluginServer) ListAndWatch(empty *pluginapi.Empty, stream pluginapi.DevicePlugin_ListAndWatchServer) error {
// Device monitoring and update implementation
}
// pkg/device-plugin/nvidia/plugin.go
// NvidiaDevicePlugin is the NVIDIA plugin type shown for
// pkg/device-plugin/nvidia/plugin.go.
type NvidiaDevicePlugin struct {
// devices is a slice of *pluginapi.Device.
// presumably the devices this plugin advertises — TODO confirm.
devices []*pluginapi.Device
// server is a *grpc.Server.
// presumably the gRPC server that serves the plugin API — TODO confirm.
server *grpc.Server
}
// Allocate takes a context and an allocate request and returns an allocate
// response and an error. The body is a placeholder in this overview: the
// implementation is elided, so the function has no return statement as written.
func (p *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
// Device allocation logic implementation
}
pkg/scheduler
// pkg/scheduler/policy/policy.go
// SchedulerPolicy is the policy interface shown for
// pkg/scheduler/policy/policy.go. It pairs a boolean filter with an integer
// score, both evaluated for a (pod, node) pair.
type SchedulerPolicy interface {
// Filter takes a pod and a node and returns a bool.
// presumably true means the node is acceptable for the pod — TODO confirm.
Filter(pod *corev1.Pod, node *corev1.Node) bool
// Score takes a pod and a node and returns an int64 score; the score range
// and ordering are not shown here — TODO confirm.
Score(pod *corev1.Pod, node *corev1.Node) int64
}
// pkg/scheduler/policy/binpack.go
// BinpackPolicy is the field-less policy type shown for
// pkg/scheduler/policy/binpack.go. A Score method is declared on
// *BinpackPolicy in this overview.
type BinpackPolicy struct{}
// Score takes a pod and a node and returns an int64. The body is a placeholder
// in this overview: the scoring logic is elided, so the function has no return
// statement as written.
func (p *BinpackPolicy) Score(pod *corev1.Pod, node *corev1.Node) int64 {
// Binpack policy scoring implementation
}
// pkg/scheduler/nodes.go
// NodeManager is the node bookkeeping type shown for pkg/scheduler/nodes.go.
type NodeManager struct {
// nodes maps a string key to a *NodeInfo.
// presumably keyed by node name — TODO confirm.
nodes map[string]*NodeInfo
}
// UpdateNode takes a node and returns nothing. The body is a placeholder in
// this overview: the update logic is elided.
func (nm *NodeManager) UpdateNode(node *corev1.Node) {
// Node status update implementation
}
pkg/monitor
Our monitoring system provides comprehensive metrics collection and analysis. For real-world testing results, see our HAMi Isolation Test Report.
// pkg/monitor/monitor.go
// DeviceMonitor is the monitoring interface shown for pkg/monitor/monitor.go.
type DeviceMonitor interface {
// GetMetrics returns a *DeviceMetrics and an error.
GetMetrics() (*DeviceMetrics, error)
// WatchDevices returns a receive-only channel of *DeviceEvent and an error.
// NOTE(review): when the channel is closed and who closes it is not shown
// here — confirm.
WatchDevices() (<-chan *DeviceEvent, error)
}
// pkg/monitor/nvidia/monitor.go
// NvidiaMonitor is the NVIDIA monitor type shown for
// pkg/monitor/nvidia/monitor.go. A GetMetrics method is declared on
// *NvidiaMonitor in this overview.
type NvidiaMonitor struct {
// devices is a slice of *nvml.Device.
// presumably the NVML handles of the monitored GPUs — TODO confirm.
devices []*nvml.Device
}
// GetMetrics returns a *DeviceMetrics and an error. The body is a placeholder
// in this overview: the collection logic is elided, so the function has no
// return statement as written.
func (m *NvidiaMonitor) GetMetrics() (*DeviceMetrics, error) {
// NVIDIA metrics collection implementation
}
pkg/oci
// pkg/oci/runtime.go
// Runtime is the container runtime interface shown for pkg/oci/runtime.go.
// Each method returns only an error.
type Runtime interface {
// CreateContainer takes a *RuntimeConfig and returns an error.
CreateContainer(config *RuntimeConfig) error
// StartContainer takes a container id string and returns an error.
StartContainer(id string) error
// StopContainer takes a container id string and returns an error.
StopContainer(id string) error
}
// pkg/oci/device.go
// ConfigureDeviceMounts takes a *specs.Spec and a slice of device strings and
// returns an error. The body is a placeholder in this overview: the mount
// configuration logic is elided, so the function has no return statement as
// written.
// NOTE(review): presumably spec is modified in place and devices holds device
// paths — confirm against the full source.
func ConfigureDeviceMounts(spec *specs.Spec, devices []string) error {
// Device mount configuration implementation
}
pkg/k8sutil
// pkg/k8sutil/pod.go
// UpdatePodStatus takes a Kubernetes client interface and a pod and returns an
// error. The body is a placeholder in this overview: the update logic is
// elided, so the function has no return statement as written.
func UpdatePodStatus(client kubernetes.Interface, pod *corev1.Pod) error {
// Pod status update implementation
}
// pkg/k8sutil/resource.go
// ValidateResourceRequirements takes a pod and returns an error. The body is a
// placeholder in this overview: the validation rules are elided, so the
// function has no return statement as written.
func ValidateResourceRequirements(pod *corev1.Pod) error {
// Resource validation implementation
}
pkg/util
// pkg/util/nodelock/lock.go
// NodeLock is the lock type shown for pkg/util/nodelock/lock.go. A Lock method
// is declared on *NodeLock in this overview.
type NodeLock struct {
// name is an unexported string field.
// presumably the name of the locked object — TODO confirm.
name string
// namespace is an unexported string field.
// presumably the Kubernetes namespace of that object — TODO confirm.
namespace string
}
// Lock returns an error. The body is a placeholder in this overview: the
// locking logic is elided, so the function has no return statement as written.
func (l *NodeLock) Lock() error {
// Distributed lock implementation
}
// pkg/util/flag/flag.go
// ParseFlags takes no arguments and returns a *Config and an error. The body
// is a placeholder in this overview: the parsing logic is elided, so the
// function has no return statement as written.
func ParseFlags() (*Config, error) {
// Configuration parsing implementation
}
Main test file distribution:
pkg/
├── device/
│ └── devices_test.go # Device management tests
├── scheduler/
│ └── scheduler_test.go # Scheduler tests
├── monitor/
│ └── monitor_test.go # Monitoring system tests
└── util/
  └── util_test.go # Utility function tests
Core dependency configuration:
// go.mod
module github.com/Project-HAMi/HAMi
require (
k8s.io/api v0.24.0
k8s.io/apimachinery v0.24.0
k8s.io/client-go v0.24.0
k8s.io/klog/v2 v2.60.1
)
Next Version: v2.6.0
For our latest releases and features, check out:
Tasks:
For detailed information: HAMi RoadMap