# Install SGLang with all optional dependencies (the "[all]" extra)
pip install "sglang[all]"
# Launch a local server hosting Llama 3.1 8B Instruct on port 30000
python -m sglang.launch_server \
--model-path meta-llama/Llama-3.1-8B-Instruct \
--port 30000
# Or run the same server in the official Docker image; --gpus all exposes
# every GPU, and --host 0.0.0.0 makes the server reachable from outside
# the container (port 30000 is published via -p)
docker run --gpus all -p 30000:30000 \
lmsys/sglang:latest \
python -m sglang.launch_server \
--model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0
The TreeNode class is the building block of the radix tree. Each node stores children, KV cache references, lock reference counters, and LRU timestamps. This implements the RadixAttention idea described in the Core Concepts section.
class RadixKey:
    """Lookup key for the radix tree: a token sequence plus an optional namespace."""

    def __init__(self, token_ids, extra_key=None, is_bigram=False):
        # The token sequence identifying the cached prefix.
        self.token_ids = token_ids
        # Optional namespace-isolation tag (e.g., a lora_id) so entries
        # from different namespaces never collide.
        self.extra_key = extra_key
        # Marks keys that use the bigram encoding.
        self.is_bigram = is_bigram
class TreeNode:
    """A node of the radix tree backing the KV cache.

    Each node stores its children, the token key on the edge from its
    parent, references to GPU KV-cache memory, a lock reference count
    that protects it from eviction, and LRU bookkeeping.
    """

    # Monotonically increasing source of unique node ids.
    counter = 0

    def __init__(self, id=None, priority=0):
        self.children = defaultdict(TreeNode)
        self.parent = None
        self.key = None    # token sequence on the edge from the parent
        self.value = None  # GPU memory indices for this node's KV cache
        self.lock_ref = 0  # node cannot be evicted while > 0
        self.last_access_time = time.monotonic()  # LRU timestamp
        self.hit_count = 0
        self.priority = priority
        # Fix: the original declared `counter` and accepted `id` but used
        # neither. Assign a unique node id, honoring an explicit override.
        self.id = TreeNode.counter if id is None else id
        TreeNode.counter += 1
The match_prefix() method walks the radix tree to find the longest cached prefix. This is the operation that enables KV cache reuse across requests.
def match_prefix(self, params):
    """Find the longest cached prefix of ``params.key`` in the radix tree.

    The key is first (optionally) bigram-converted and truncated to a
    whole number of pages so matches line up with the paged KV-cache
    layout, then the tree is walked from the root.
    """
    lookup_key, _ = self.maybe_bigram_convert(params.key)

    # Nothing to match when caching is disabled or the key is empty.
    if self.disable or len(lookup_key) == 0:
        return empty_match_result()

    # Page-align: drop any trailing partial page from the key.
    if self.page_size != 1:
        aligned_len = len(lookup_key) // self.page_size * self.page_size
        lookup_key = lookup_key[:aligned_len]

    # Walk down from the root, collecting matched KV-cache indices.
    indices, last_node = self._match_prefix_helper(self.root_node, lookup_key)
    if indices:
        # Concatenate per-node GPU memory indices into a single tensor.
        indices = torch.cat(indices)

    return MatchResult(device_indices=indices, last_device_node=last_node)
The main scheduling loop implements continuous batching. It runs indefinitely, pulling requests and dispatching GPU work.
def event_loop_normal(self):
    """Run the continuous-batching scheduler loop indefinitely.

    Each iteration ingests newly arrived requests, selects the next
    batch, and either dispatches it to the GPU or runs idle-time self
    checks when there is no work.
    """
    while True:
        # Pull in any requests that arrived since the last iteration.
        incoming = self.recv_requests()
        self.process_input_requests(incoming)

        # Pick the next batch to execute (may be falsy when idle).
        next_batch = self.get_next_batch_to_run()
        self.cur_batch = next_batch

        if next_batch:
            # Dispatch GPU work and fold results back into scheduler state.
            outcome = self.run_batch(next_batch)
            self.process_batch_result(next_batch, outcome)
        else:
            # No runnable batch: use the idle cycle for health checks.
            self.self_check_during_idle()

        self.last_batch = next_batch
The scheduling policy determines request priority. LPM sorts by longest prefix match to maximize cache reuse.
class CacheAwarePolicy(Enum):
    """Scheduling policies that consult the radix-tree cache state."""

    LPM = "lpm"                # longest prefix match: maximize cache reuse
    DFS_WEIGHT = "dfs-weight"  # depth-first-search weighted ordering
class CacheAgnosticPolicy(Enum):
    """Scheduling policies that ignore the cache state."""

    FCFS = "fcfs"                # first come, first served
    LOF = "lof"                  # longest output first
    RANDOM = "random"            # random ordering
    ROUTING_KEY = "routing-key"  # order by routing key
def calc_priority(self, waiting_queue, running_batch=None):
    """Prioritize requests in ``waiting_queue`` according to the policy.

    Returns False immediately for FCFS, where arrival order already is
    the priority. For cache-aware policies, computes each request's
    prefix match against the radix tree and, under LPM, reorders the
    queue in place so the longest matches run first.
    """
    # FCFS needs no reordering at all.
    if self.policy == CacheAgnosticPolicy.FCFS:
        return False

    active_policy = self._determine_active_policy(waiting_queue)

    if isinstance(active_policy, CacheAwarePolicy):
        # Prefix-match every waiting request; the matching step may mark
        # some requests as temporarily deprioritized.
        deprioritized = self._compute_prefix_matches(waiting_queue, active_policy)
        if active_policy == CacheAwarePolicy.LPM:
            # Longest-prefix-match first maximizes KV-cache reuse.
            SchedulePolicy._sort_by_longest_prefix(waiting_queue, deprioritized)
The XGrammar backend compiles grammar specs into token bitmasks for efficient constrained generation.
class XGrammarGrammarBackend(BaseGrammarBackend):
    """Backend that compiles grammar specs into token bitmasks via XGrammar."""

    def __init__(self, tokenizer, vocab_size,
                 model_eos_token_ids=None, any_whitespace=True):
        super().__init__()

        # Prefer the tokenizer's own XGrammar initialization hook when one
        # exists; otherwise derive TokenizerInfo from the HF tokenizer.
        if hasattr(tokenizer, "init_xgrammar"):
            tokenizer_info, override_stop_tokens = tokenizer.init_xgrammar()
        else:
            tokenizer_info = TokenizerInfo.from_huggingface(
                tokenizer,
                vocab_size=vocab_size,
                stop_token_ids=model_eos_token_ids,
            )

        self.grammar_compiler = GrammarCompiler(tokenizer_info=tokenizer_info)
        self.vocab_size = vocab_size
The intermediate representation defines the SGLang language primitives that capture program structure.
class SglGen(SglExpr):
    """IR primitive: generate text from the model."""

    def __init__(self, name, max_tokens, stop, **kwargs):
        self.name = name
        # Fold all generation controls into a single sampling-params object.
        self.sampling_params = SglSamplingParams(
            max_new_tokens=max_tokens,
            stop=stop,
            **kwargs,
        )
class SglSelect(SglExpr):
    """IR primitive: select from predefined choices.

    Fix: the original accepted ``temperature`` but silently dropped it;
    store it so the executor can use it when scoring the choices.
    """

    def __init__(self, name, choices, temperature):
        self.name = name
        self.choices = choices
        self.temperature = temperature
class SglFork(SglExpr):
    """IR primitive: create parallel execution branches.

    Fix: the original accepted ``position_ids_offset`` but never stored
    it; keep it on the node so downstream passes can read it.
    """

    def __init__(self, number, position_ids_offset=None):
        self.number = number
        self.position_ids_offset = position_ids_offset
GPU Memory: Allocate roughly 2x the model size for serving (weights plus KV cache). Llama-3.1-70B in FP16 needs ~140GB for the weights alone, so plan on 2-4 A100 80GB GPUs.
Monitoring: Use --enable-metrics for Prometheus. Watch sglang_cache_hit_rate, sglang_token_throughput, and sglang_time_to_first_token_seconds.
Quantization: Use FP8 (--quantization fp8) for 2x memory savings with minimal quality loss.
Scaling Out: Use --dp-size N for data parallelism. Consider prefill/decode disaggregation for latency-sensitive workloads.