MLX Framework Cheatsheet

Overview

MLX is an array framework for machine learning on Apple silicon, designed by Apple machine learning research. It combines high performance with familiar APIs and takes advantage of Apple silicon's unified memory architecture.

Core Features

  • Familiar APIs: Python API closely following NumPy, with C++ and Swift interfaces
  • Composable function transformations: for automatic differentiation, vectorization, and graph optimization
  • Lazy computation: arrays are only materialized when needed (see the sketch after this list)
  • Dynamic graph construction: changing shapes does not trigger slow recompilations
  • Unified memory: arrays live in shared memory, so operations run across devices without data copies
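
A minimal sketch of lazy computation and the unified memory model (shapes are illustrative; the per-op stream argument follows the MLX unified memory docs):

import mlx.core as mx

# Lazy computation: these lines only record the operation graph
a = mx.random.normal((1024, 1024))
b = mx.random.normal((1024, 1024))
c = mx.matmul(a, b)

# Nothing has run yet; mx.eval materializes the result
mx.eval(c)

# Unified memory: the same arrays are visible to every device,
# so each op can pick its device without copying data
d = mx.add(a, b, stream=mx.cpu)
e = mx.add(a, b, stream=mx.gpu)
mx.eval(d, e)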

Installation

# Install MLX
pip install mlx

# Install MLX-LM for language models
pip install mlx-lm

MLX Core Components

Arrays and Basic Operations

import mlx.core as mx

# Create arrays
a = mx.array([1, 2, 3])
b = mx.zeros((3, 3))
c = mx.ones((2, 4))
d = mx.random.normal((2, 2))

# Basic operations (lazy: these build a graph, nothing runs yet)
sum_ab = a + b             # broadcasting: (3,) + (3, 3) -> (3, 3)
product = mx.matmul(b, b)

# Evaluate lazily computed arrays
mx.eval(sum_ab, product)

Function Transformations

import mlx.core as mx

# Gradient computation
def f(x):
    return mx.sum(x ** 2)

grad_f = mx.grad(f)
x = mx.array([1.0, 2.0, 3.0])
grad_value = grad_f(x)  # [2.0, 4.0, 6.0]

# Vectorization
def scalar_fn(x):
    return x ** 2

vector_fn = mx.vmap(scalar_fn)
vector_fn(mx.array([1.0, 2.0, 3.0]))  # [1.0, 4.0, 9.0]

# Combined transformations: mx.grad requires a scalar-valued function,
# so compose grad inside vmap to get per-element gradients
grad_vector_fn = mx.vmap(mx.grad(scalar_fn))
grad_vector_fn(mx.array([1.0, 2.0, 3.0]))  # [2.0, 4.0, 6.0]

Compilation

import mlx.core as mx

@mx.compile
def optimized_fn(x):
    return mx.sum(x ** 2)

# With state tracking (mx.compile takes the function as its first
# argument, so use functools.partial to pass inputs/outputs)
from functools import partial

state = [mx.array(1.0)]

@partial(mx.compile, inputs=state, outputs=state)
def stateful_fn(x):
    result = x + state[0]
    state[0] = result
    return result
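
A short usage note: because state is declared as a compiled input and output, updates to it persist across calls (values follow from the code above):

x = mx.array(2.0)
print(stateful_fn(x))  # 3.0 (2.0 + initial state 1.0); state[0] is now 3.0
print(stateful_fn(x))  # 5.0 (2.0 + updated state 3.0)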

Neural Networks (mlx.nn)

Building a Basic Neural Network

import mlx.core as mx
import mlx.nn as nn

class MLP(nn.Module):
    def __init__(self, in_dims, hidden_dims, out_dims):
        super().__init__()
        self.layers = [
            nn.Linear(in_dims, hidden_dims),
            nn.Linear(hidden_dims, out_dims)
        ]
    
    def __call__(self, x):
        for layer in self.layers[:-1]:
            x = layer(x)
            x = mx.maximum(x, 0)  # ReLU activation
        return self.layers[-1](x)

# Create model
model = MLP(10, 128, 1)

# Parameters are created lazily; mx.eval forces initialization
mx.eval(model.parameters())

# Access parameters
params = model.parameters()
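
Running a forward pass with the model defined above (batch size and inputs are arbitrary):

x = mx.random.normal((32, 10))  # batch of 32 samples, 10 features each
y = model(x)                    # lazy forward pass, output shape (32, 1)
mx.eval(y)                      # materialize the output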

Common Layers

# Linear layer
linear = nn.Linear(input_dim, output_dim)

# Convolutional layer
conv = nn.Conv2d(in_channels, out_channels, kernel_size=3)

# Layer normalization
norm = nn.LayerNorm(dim)

# Dropout (for training)
dropout = nn.Dropout(p=0.5)

# Multi-head attention
attention = nn.MultiHeadAttention(dim, num_heads)

Loss Functions

import mlx.nn.losses as losses

# Common loss functions
mse_loss = losses.mse_loss(predictions, targets)
bce_loss = losses.binary_cross_entropy(predictions, targets)
ce_loss = losses.cross_entropy(predictions, targets)
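
Note that MLX losses default to reduction="none" and return per-example losses; a small sketch with illustrative shapes:

import mlx.core as mx
import mlx.nn.losses as losses

logits = mx.random.normal((4, 10))  # batch of 4, 10 classes
targets = mx.array([1, 3, 0, 7])    # integer class labels

per_example = losses.cross_entropy(logits, targets)                  # shape (4,)
mean_loss = losses.cross_entropy(logits, targets, reduction="mean")  # scalar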

Optimizers (mlx.optimizers)

import mlx.optimizers as optim

# Create optimizer
optimizer = optim.SGD(learning_rate=0.01)
# Or
optimizer = optim.Adam(learning_rate=0.001, betas=(0.9, 0.999))

# Update model with gradients
optimizer.update(model, gradients)

# Evaluate optimizer state and model parameters
mx.eval(optimizer.state, model.parameters())

Training Loop Pattern

import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as optim

# Create model
model = MyModel()
mx.eval(model.parameters())

# Define loss function
def loss_fn(model, x, y):
    y_pred = model(x)
    return nn.losses.mse_loss(y_pred, y)

# Create gradient function and optimizer
loss_and_grad_fn = nn.value_and_grad(model, loss_fn)
optimizer = optim.Adam(learning_rate=0.001)

# Training loop
for epoch in range(num_epochs):
    for x_batch, y_batch in data_loader:
        # Forward and backward pass
        loss, grads = loss_and_grad_fn(model, x_batch, y_batch)
        
        # Update model parameters
        optimizer.update(model, grads)
        
        # Evaluate parameters and optimizer state
        mx.eval(model.parameters(), optimizer.state)

MLX-LM Commands

Model Generation

# Generate text with a model (output streams to stdout as it is generated)
mlx_lm.generate --model mistralai/Mistral-7B-Instruct-v0.3 --prompt "hello"

# Set generation parameters
mlx_lm.generate --model <model_name> --prompt "hello" --max-tokens 100 --temp 0.7 --top-p 0.9
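
The same operations are available from Python; a minimal sketch of the mlx_lm API (exact keyword arguments can shift between releases):

from mlx_lm import load, generate

# Downloads the model from the Hugging Face Hub on first use
model, tokenizer = load("mistralai/Mistral-7B-Instruct-v0.3")

text = generate(model, tokenizer, prompt="hello", max_tokens=100, verbose=True)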

Model Conversion

# Convert Hugging Face model to MLX format
mlx_lm.convert --hf-path mistralai/Mistral-7B-Instruct-v0.3

# Convert and quantize to 4-bit
mlx_lm.convert --hf-path mistralai/Mistral-7B-Instruct-v0.3 -q

# Convert, quantize, and upload to Hugging Face
mlx_lm.convert --hf-path mistralai/Mistral-7B-Instruct-v0.3 -q --upload-repo <username>/<repo-name>

Interactive Chat

# Start interactive chat with a model
mlx_lm.chat --model mistralai/Mistral-7B-Instruct-v0.3

# Use a local model
mlx_lm.chat --model ./path/to/local/model

Fine-tuning with LoRA

# Basic LoRA fine-tuning
mlx_lm.lora --model mistralai/Mistral-7B-v0.1 --train --data ./my_data_folder

# Set specific parameters
mlx_lm.lora \
  --model mistralai/Mistral-7B-v0.1 \
  --train \
  --data ./my_data_folder \
  --batch-size 1 \
  --num-layers 4 \
  --iters 500
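
The --data folder is expected to contain train.jsonl and valid.jsonl, one JSON object per line; {"text": ...} is the simplest accepted schema (prompt/completion and chat schemas also exist). A sketch that writes such a folder:

import json
from pathlib import Path

# One illustrative example; real datasets need many more, and
# valid.jsonl should hold held-out data, not a copy of train
examples = [{"text": "Question: What is MLX?\nAnswer: An array framework for Apple silicon."}]

data_dir = Path("my_data_folder")
data_dir.mkdir(exist_ok=True)
for split in ("train", "valid"):
    with open(data_dir / f"{split}.jsonl", "w") as f:
        for example in examples:
            f.write(json.dumps(example) + "\n")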

# Use quantized model (QLoRA)
mlx_lm.lora --model <quantized_model_path> --train --data ./my_data_folder

# Test a fine-tuned model
mlx_lm.lora \
  --model <path_to_model> \
  --adapter-path <path_to_adapters> \
  --data <path_to_data> \
  --test

# Generate with a fine-tuned model
mlx_lm.generate \
  --model <path_to_model> \
  --adapter-path <path_to_adapters> \
  --prompt "<your_prompt>"

Fusing Adapters

# Fuse LoRA adapters with the original model
mlx_lm.fuse \
  --model <path_to_model> \
  --adapter-path <path_to_adapters> \
  --save-path <output_path>

# Fuse and upload to Hugging Face
mlx_lm.fuse \
  --model <path_to_model> \
  --adapter-path <path_to_adapters> \
  --save-path <output_path> \
  --upload-repo <username>/<repo-name>

# Export to GGUF format
mlx_lm.fuse \
  --model <path_to_model> \
  --adapter-path <path_to_adapters> \
  --export-gguf

Model Management

# Scan all locally cached models
mlx_lm.manage --scan

# Delete specific models
mlx_lm.manage --delete --pattern <model_name_pattern>

API Server

# Run OpenAI-compatible API server
mlx_lm.server

# Interact with the server
curl localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "mlx-community/Llama-3.2-3B-Instruct-4bit",
    "max_completion_tokens": 2000,
    "messages": [{"role": "user", "content": "Hello there"}]
  }'
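
Equivalently from Python with the requests library (assumes the server above is running on its default port 8080):

import requests

response = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "model": "mlx-community/Llama-3.2-3B-Instruct-4bit",
        "max_completion_tokens": 2000,
        "messages": [{"role": "user", "content": "Hello there"}],
    },
)
# Standard OpenAI-compatible response shape
print(response.json()["choices"][0]["message"]["content"])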

Swift MLX Integration

// Add dependency in Package.swift
dependencies: [
    .package(url: "https://github.com/ml-explore/mlx-swift", from: "0.10.0")
]

// Import packages
import MLX
import MLXNN
import MLXOptimizers
import MLXRandom

Resource Links

  • MLX documentation: https://ml-explore.github.io/mlx/
  • MLX: https://github.com/ml-explore/mlx
  • MLX-LM: https://github.com/ml-explore/mlx-lm
  • MLX Examples: https://github.com/ml-explore/mlx-examples
  • MLX Swift: https://github.com/ml-explore/mlx-swift