Getting Started
# Install Ollama (macOS)
brew install ollama
# Install Ollama (Linux)
curl -fsSL https://ollama.com/install.sh | sh
# Start the server and run a model
ollama serve &
ollama run llama3.2
# Use the API
curl http://localhost:11434/api/chat \
-d '{"model":"llama3.2","messages":[{"role":"user","content":"Hello"}]}'
Source Code Walkthrough
Each core concept maps to specific source files in the Ollama codebase. Below are annotated excerpts from the actual repository.
Model Library -- Name Parsing
The Name type is the entry point for all model operations. Every ollama run or API call starts by parsing the model name into its components.
// Name is the structured form of a model reference, produced by ParseName.
// Format (per the ParseName comment below): [host/][namespace/]model[:tag].
type Name struct {
Host string // registry host; defaults to "registry.ollama.ai" per ParseName docs
Namespace string // namespace; defaults to "library" per ParseName docs
Model string // model name, e.g. "llama3.2"
Tag string // version tag; defaults to "latest" per ParseName docs
ProtocolScheme string // scheme detected via "://" (ParseName step 3)
}
// ParseName parses a model reference string
// Format: [host/][namespace/]model[:tag]
// Defaults: registry.ollama.ai / library / latest
//
// NOTE(excerpt): the parsing steps are summarized in the comments below;
// the real implementation is elided, so r is returned zero-valued here.
// The fill parameter's FillKind semantics are not shown in this excerpt --
// confirm against types/model/name.go.
func ParseName(s string, fill FillKind) Name {
var r Name
// Parsing works backward through the string:
// 1. Last colon separates the tag
// 2. Slashes separate host, namespace, model
// 3. "://" detects protocol scheme
return r
}
// DisplayShortest omits default components
// "registry.ollama.ai/library/llama3.2:latest" -> "llama3.2"
// (body elided in this excerpt)
func (n Name) DisplayShortest() string { /* ... */ }
REST API -- Route Registration
The GenerateRoutes method wires up all HTTP endpoints, including both Ollama-native and OpenAI-compatible APIs.
// GenerateRoutes sets up all HTTP endpoints
//
// NOTE(excerpt): the body below is a route table in comment form; the
// actual registration code and the (http.Handler, error) return are
// elided -- see server/routes.go for the real implementation.
func (s *Server) GenerateRoutes(rc *ollama.Registry) (http.Handler, error) {
// Ollama-native endpoints
// POST /api/generate -> GenerateHandler
// POST /api/chat -> ChatHandler
// POST /api/embed -> EmbedHandler
// POST /api/pull -> PullHandler
// GET /api/tags -> ListHandler
// POST /api/show -> ShowHandler
// DELETE /api/delete -> DeleteHandler
// OpenAI-compatible endpoints
// POST /v1/chat/completions -> wraps ChatHandler
// POST /v1/completions -> wraps GenerateHandler
// POST /v1/embeddings -> wraps EmbedHandler
}
Quantization -- Options and Runner Config
The Options struct controls how inference runs, including GPU allocation and sampling parameters.
// Options controls how inference runs. The embedded Runner (defined below)
// promotes runtime fields (context size, GPU allocation); the remaining
// fields are sampling parameters per the walkthrough text above.
type Options struct {
Runner // embedded: promotes NumCtx, NumGPU, etc. into Options
NumKeep int `json:"num_keep,omitempty"` // presumably prompt tokens kept on context overflow -- confirm
Seed int `json:"seed,omitempty"` // RNG seed (assumed to make sampling reproducible -- confirm)
NumPredict int `json:"num_predict,omitempty"` // presumably max tokens to generate -- confirm
TopK int `json:"top_k,omitempty"` // top-k sampling cutoff
TopP float32 `json:"top_p,omitempty"` // top-p (nucleus) sampling threshold
Temperature float32 `json:"temperature,omitempty"` // sampling temperature
RepeatPenalty float32 `json:"repeat_penalty,omitempty"` // penalty applied to repeated tokens
Stop []string `json:"stop,omitempty"` // stop sequences that end generation
}
// Runner holds runtime configuration for the inference process; it is
// embedded in Options (above) so its fields are promoted there.
type Runner struct {
NumCtx int `json:"num_ctx,omitempty"` // Context window size
NumBatch int `json:"num_batch,omitempty"` // Batch size
NumGPU int `json:"num_gpu,omitempty"` // GPU layers
MainGPU int `json:"main_gpu,omitempty"` // Primary GPU
UseMMap *bool `json:"use_mmap,omitempty"` // Memory-map model (pointer distinguishes unset from false)
NumThread int `json:"num_thread,omitempty"` // CPU threads
}
Scheduler -- Model Lifecycle Management
The Scheduler manages loaded models with reference counting, keep-alive timers, and memory-aware eviction.
// Scheduler manages loaded models with reference counting, keep-alive
// timers, and memory-aware eviction (per the walkthrough text above).
// The channels feed scheduling events; loadedMu presumably guards the
// loaded map -- confirm against server/sched.go.
type Scheduler struct {
pendingReqCh chan *LlmRequest // Incoming load requests
finishedReqCh chan *LlmRequest // Completed notifications
expiredCh chan *runnerRef // Expired model timers
unloadedCh chan any // VRAM recovery confirms
loadedMu sync.Mutex
loaded map[string]*runnerRef // Currently loaded models
}
// runnerRef tracks one loaded model: its inference subprocess, GPU/VRAM
// footprint, and the keep-alive bookkeeping used for eviction.
type runnerRef struct {
refCount uint // Active requests (prevents unload)
llama llm.LlamaServer // Inference subprocess
gpus []ml.DeviceID // Assigned GPUs
vramSize uint64 // VRAM consumption
sessionDuration time.Duration // Keep-alive timeout
expireTimer *time.Timer // Unload timer
}
Runner -- Inference Backend Interface
The LlamaServer interface abstracts over two backends: legacy llama.cpp and the newer Ollama/MLX engine.
// LlamaServer abstracts over the two inference backends (legacy llama.cpp
// and the newer Ollama/MLX engine, per the walkthrough text above).
type LlamaServer interface {
// Load places the model onto devices and returns the assigned device IDs;
// requireFull presumably demands the whole model fit -- confirm.
Load(ctx context.Context, systemInfo ml.SystemInfo,
gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error)
// Ping checks that the backend is responsive.
Ping(ctx context.Context) error
// WaitUntilRunning blocks until the backend is ready or ctx ends.
WaitUntilRunning(ctx context.Context) error
// Completion runs generation, invoking fn per response chunk.
Completion(ctx context.Context, req CompletionRequest,
fn func(CompletionResponse)) error
// Embedding returns the embedding vector for input; the int result's
// meaning is not shown here (presumably token count) -- confirm.
Embedding(ctx context.Context, input string) ([]float32, int, error)
// Close shuts down the backend.
Close() error
// MemorySize reports total and VRAM usage (units not shown; presumably bytes).
MemorySize() (total, vram uint64)
// Pid returns the inference subprocess ID.
Pid() int
}
GGUF Format -- Model Conversion
The ConvertModel function transforms models from training formats (SafeTensors) to GGUF for inference.
// ConvertModel transforms a model from a training format (SafeTensors)
// to GGUF for inference, writing the result to ws.
//
// NOTE(excerpt): error handling between the numbered steps is elided, so
// bts and the err results appear unchecked here; the conv value used in
// step 3 comes from the architecture routing elided at step 2.
func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
// 1. Read config.json to identify architecture
bts, err := fs.ReadFile(fsys, "config.json")
// 2. Route to architecture-specific converter
// Supports 25+ architectures: Llama, Mistral,
// Gemma, Qwen, Phi, and more
// 3. Parse tensors with name remapping
ts, err := parseTensors(fsys,
strings.NewReplacer(conv.Replacements()...))
// 4. Load tokenizer vocabulary
// 5. Write GGUF binary output
return ggml.WriteGGUF(ws, kv, ts)
}
Model name parsing lives in types/model/name.go, the REST API in server/routes.go, the Scheduler in server/sched.go, the Runner in llm/server.go, GGUF conversion in convert/convert.go, and the Quantization/Modelfile configuration in api/types.go.