{
  "categories": [
    {
      "id": "precision-formats",
      "label": "Precision Formats",
      "order": 0,
      "entries": [
        "bf16",
        "e2m1",
        "fp16",
        "fp32",
        "fp8",
        "int4",
        "int8",
        "nf4"
      ]
    },
    {
      "id": "quantization-basics",
      "label": "Quantization Basics",
      "order": 1,
      "entries": [
        "calibration-data",
        "context-window",
        "dequantization",
        "model-size-memory",
        "per-group-quantization",
        "ptq",
        "qat",
        "quantization",
        "token",
        "w4a16",
        "w8a8"
      ]
    },
    {
      "id": "quantization-methods",
      "label": "Quantization Methods",
      "order": 2,
      "entries": [
        "aqlm",
        "awq",
        "bitsandbytes",
        "eetq",
        "exl2",
        "gptq",
        "hqq",
        "imatrix",
        "kv-cache-quantization",
        "marlin",
        "quanto",
        "quip-sharp",
        "smoothquant",
        "spinquant",
        "unsloth-dynamic"
      ]
    },
    {
      "id": "formats",
      "label": "Formats",
      "order": 3,
      "entries": [
        "format-coreml",
        "format-ct2",
        "format-onnx",
        "format-openvino",
        "ggml",
        "gguf",
        "iq-quants",
        "k-quants",
        "mxfp4-moe",
        "safetensors"
      ]
    },
    {
      "id": "attention-variants",
      "label": "Attention & Memory",
      "order": 4,
      "entries": [
        "attention-mask",
        "cross-attention",
        "flash-attention",
        "gqa",
        "kv-cache",
        "linear-attention",
        "mha",
        "mla",
        "mqa",
        "paged-attention",
        "sdpa",
        "self-attention",
        "swa"
      ]
    },
    {
      "id": "position-encodings",
      "label": "Position Encodings",
      "order": 5,
      "entries": [
        "abf",
        "alibi",
        "learned-pe",
        "ntk-rope",
        "position-interpolation",
        "rope",
        "sinusoidal-pe",
        "yarn"
      ]
    },
    {
      "id": "layer-types",
      "label": "Layer Types",
      "order": 6,
      "entries": [
        "backbone",
        "clip",
        "diffusion-models",
        "ffn",
        "geglu",
        "layernorm",
        "mamba",
        "model-head",
        "ple",
        "pre-norm",
        "residual-connection",
        "rmsnorm",
        "rwkv",
        "seq2seq",
        "swiglu",
        "vit",
        "vlm",
        "weight-tying"
      ]
    },
    {
      "id": "scaling-patterns",
      "label": "Scaling & Serving",
      "order": 7,
      "entries": [
        "api-provider",
        "chain-of-thought",
        "chatbot-arena",
        "chunked-prefill",
        "context-engineering",
        "continuous-batching",
        "data-contamination",
        "data-parallelism",
        "dense-models",
        "evals",
        "expert-parallelism",
        "expert-routing",
        "gpqa",
        "gpu-memory",
        "gsm8k",
        "humaneval",
        "inference-engine",
        "inference-metrics",
        "inference",
        "knowledge-distillation",
        "latency",
        "llm-as-judge",
        "llmops",
        "metr",
        "mmlu",
        "model-merging",
        "moe",
        "mt-bench",
        "observability",
        "perplexity",
        "prefix-caching",
        "pruning",
        "rag",
        "reasoning-models",
        "red-teaming",
        "rouge",
        "scaling-test-time",
        "speculative-decoding",
        "test-time-compute",
        "thinking-tokens",
        "tp-pp",
        "truthfulqa",
        "zero"
      ]
    },
    {
      "id": "training-pipeline",
      "label": "Training Pipeline",
      "order": 8,
      "entries": [
        "causal-lm",
        "cpt",
        "dpo",
        "gradient-checkpointing",
        "grpo",
        "kto",
        "mixed-precision-training",
        "orpo",
        "post-training",
        "pre-training",
        "rlhf",
        "rlvr",
        "scaling-laws",
        "sft",
        "simpo",
        "training-recipe",
        "transfer-learning"
      ]
    },
    {
      "id": "sampling-decoding",
      "label": "Sampling & Decoding",
      "order": 9,
      "entries": [
        "beam-search",
        "greedy-decoding",
        "min-p",
        "repetition-penalty",
        "structured-output",
        "temperature",
        "top-k",
        "top-p"
      ]
    },
    {
      "id": "fine-tuning-methods",
      "label": "Fine-Tuning Methods",
      "order": 10,
      "entries": [
        "adapters",
        "dora",
        "fine-tuning",
        "full-ft",
        "ia3",
        "lora",
        "peft",
        "qlora"
      ]
    },
    {
      "id": "model-naming",
      "label": "Model Naming",
      "order": 11,
      "entries": [
        "chat-template",
        "license-types",
        "model-card",
        "open-weights",
        "size-A",
        "size-B",
        "size-E",
        "size-T",
        "size-context",
        "size-x",
        "tag-alignment",
        "tag-base",
        "tag-coder",
        "tag-embed",
        "tag-ft",
        "tag-guard",
        "tag-hf",
        "tag-instruct",
        "tag-long",
        "tag-math",
        "tag-merged",
        "tag-moe-suffix",
        "tag-mrl",
        "tag-reward",
        "tag-scale",
        "tag-scout-maverick",
        "tag-vision"
      ]
    },
    {
      "id": "hf-organizations",
      "label": "HF Organizations",
      "order": 12,
      "entries": [
        "huggingface-hub",
        "olmo",
        "org-bartowski",
        "org-deepseek",
        "org-google",
        "org-meta",
        "org-microsoft",
        "org-mistral",
        "org-mlx-community",
        "org-mradermacher",
        "org-nousresearch",
        "org-qwen",
        "org-thebloke",
        "org-turboderp"
      ]
    },
    {
      "id": "serving-tools",
      "label": "Serving Tools",
      "order": 13,
      "entries": [
        "inspect-eval",
        "tool-accelerate",
        "tool-crewai",
        "tool-dask",
        "tool-exllamav2",
        "tool-langchain",
        "tool-langfuse",
        "tool-langsmith",
        "tool-llamacpp",
        "tool-llamaindex",
        "tool-lmstudio",
        "tool-n8n",
        "tool-ollama",
        "tool-outlines",
        "tool-ray",
        "tool-sglang",
        "tool-tensorrt-llm",
        "tool-tgi",
        "tool-triton",
        "tool-vllm"
      ]
    },
    {
      "id": "tokenizers",
      "label": "Tokenizers",
      "order": 14,
      "entries": [
        "tokenizer-bpe",
        "tokenizer-spm",
        "tokenizer-tiktoken",
        "tokenizer-wordpiece"
      ]
    },
    {
      "id": "datasets-recipes",
      "label": "Datasets & Recipes",
      "order": 15,
      "entries": [
        "data-deduplication",
        "data-mixing",
        "dataset-dolphin",
        "dataset-evolinstruct",
        "dataset-hermes",
        "dataset-orca",
        "dataset-platypus",
        "dataset-ultrachat",
        "synthetic-data"
      ]
    },
    {
      "id": "agents-and-tools",
      "label": "Agents & Tool Use",
      "order": 16,
      "entries": [
        "a2a",
        "agentic-ai",
        "dspy",
        "function-calling",
        "guardrails",
        "mcp",
        "pydantic-ai",
        "react"
      ]
    },
    {
      "id": "safety-alignment",
      "label": "Safety & Alignment",
      "order": 17,
      "entries": [
        "constitutional-ai",
        "hallucination",
        "jailbreak",
        "prompt-injection",
        "reward-model",
        "rlaif"
      ]
    },
    {
      "id": "prompting",
      "label": "Prompting",
      "order": 18,
      "entries": [
        "few-shot",
        "prompt-engineering",
        "system-prompt",
        "zero-shot"
      ]
    },
    {
      "id": "embeddings-retrieval",
      "label": "Embeddings & Retrieval",
      "order": 19,
      "entries": [
        "embeddings",
        "faiss",
        "vector-database"
      ]
    }
  ],
  "entries": {
    "a2a": {
      "id": "a2a",
      "name": "A2A Protocol",
      "expansion": "Agent-to-Agent Protocol (Google)",
      "category": "agents-and-tools",
      "oneliner": "Google's open protocol for AI agents to discover each other's capabilities and collaborate on tasks across different frameworks and vendors.",
      "explanation": "The Agent-to-Agent (A2A) protocol is an open standard from Google for enabling AI agents built on different frameworks to communicate and collaborate. Each agent publishes an Agent Card describing its capabilities, and agents can delegate tasks to each other through a standardized JSON-RPC interface. A2A complements Anthropic's MCP — while MCP connects agents to tools and data sources, A2A connects agents to other agents.",
      "fundamentals": "Core concepts: Agent Card (JSON capability description published at /.well-known/agent.json), Task (unit of work with lifecycle: submitted → working → completed/failed), Message (communication between agents within a task), Artifact (output produced by a task). Transport: HTTP + JSON-RPC 2.0, with optional SSE for streaming. Discovery: clients fetch Agent Cards to understand what an agent can do before sending tasks. Designed for interop across LangChain, CrewAI, Vertex AI, and other frameworks.",
      "related": [
        "mcp",
        "agentic-ai",
        "function-calling"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "A2A Protocol spec",
          "url": "https://github.com/google/A2A"
        },
        {
          "label": "Google A2A announcement",
          "url": "https://developers.googleblog.com/en/a2a-a-new-era-of-agent-interoperability/"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "abf": {
      "id": "abf",
      "name": "ABF",
      "expansion": "Adjusted Base Frequency",
      "category": "position-encodings",
      "oneliner": "Simplest long-context approach: increase RoPE base from 10,000 to 500K-1M during training. No post-hoc scaling needed.",
      "explanation": "Adjusted Base Frequency extends a model's context window by increasing the base constant in the RoPE position encoding formula from the default 10,000 to a much larger value. Longer wavelengths let the model distinguish positions over longer sequences without confusion, while local token discrimination is preserved. This simple change requires continued pre-training. Llama 3 uses a base of 500,000 and Qwen 2 uses 1,000,000.",
      "fundamentals": "$\\theta_i$ = B^(-2i/d). Max wavelength = 2$\\pi$·B. base=10K → ~63K positions; base=500K → ~3.1M positions; base=1M → ~6.3M positions. High-freq (i=0): $\\theta_0$=1 always (base^0=1) — local info preserved for any base. Mathematically equivalent to NTK-aware with specific $\\alpha$, but applied during training not post-hoc. The model learns to use the extended position range natively — no distributional mismatch.",
      "seen_in": [
        "model-config"
      ],
      "related": [
        "ntk-rope",
        "position-interpolation",
        "pre-training",
        "rope",
        "token",
        "yarn"
      ],
      "sources": [
        "Meta, 'The Llama 3 Herd of Models,' 2024, arXiv:2407.21783",
        "Qwen Team, 'Qwen2 Technical Report,' 2024, arXiv:2407.10671",
        "Xiong et al., 'Effective Long-Context Scaling,' 2023, arXiv:2309.16039"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Llama 3 paper (rope_theta=500K)",
          "url": "https://arxiv.org/abs/2407.21783"
        },
        {
          "label": "Qwen2 Technical Report (rope_theta=1M)",
          "url": "https://arxiv.org/abs/2407.10671"
        }
      ]
    },
    "adapters": {
      "id": "adapters",
      "name": "Adapters (General Concept)",
      "expansion": "Adapter Modules — the broad category of injected trainable modules",
      "category": "fine-tuning-methods",
      "oneliner": "Small trainable modules inserted into frozen models. Houlsby (serial bottleneck), Pfeiffer (efficient), parallel, LoRA are all adapter variants.",
      "explanation": "Adapters are small trainable modules inserted into the layers of a frozen pre-trained model to teach it new tasks without updating all its weights. They dramatically reduce the cost of fine-tuning because only the adapter parameters are trained while the original model stays intact. Variants include Houlsby bottleneck adapters, Pfeiffer single-layer adapters, parallel adapters, and LoRA.",
      "fundamentals": "Houlsby: adapter(x) = x + W_up·$\\sigma$(W_down·x). 4dr params/layer. Pfeiffer: 2dr/layer. LoRA: 2dr per target module. Inference: Houlsby 5-10% latency. LoRA merged: zero.",
      "seen_in": [
        "model-name",
        "filename"
      ],
      "related": [
        "ia3",
        "inference",
        "lora",
        "peft"
      ],
      "foundational_papers": [
        {
          "title": "Parameter-Efficient Transfer Learning for NLP",
          "authors": "Houlsby et al.",
          "venue": "ICML 2019",
          "arxiv": "1902.00751"
        },
        {
          "title": "Towards a Unified View of Parameter-Efficient Transfer Learning",
          "authors": "He et al.",
          "venue": "ICLR 2022",
          "arxiv": "2110.04366"
        },
        {
          "title": "AdapterFusion: Non-Destructive Task Composition for Transfer Learning",
          "authors": "Pfeiffer et al.",
          "venue": "EACL 2021",
          "arxiv": "2005.00247"
        }
      ],
      "sources": [
        "Houlsby et al., arXiv:1902.00751"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10"
    },
    "agentic-ai": {
      "id": "agentic-ai",
      "name": "Agentic AI",
      "expansion": "Agentic Artificial Intelligence",
      "category": "agents-and-tools",
      "oneliner": "AI systems that plan, use tools, retain memory, and execute multi-step tasks autonomously. The paradigm shift from prompt-response to goal-driven agents.",
      "explanation": "Agentic AI is the paradigm in which a language model serves as a reasoning engine inside an orchestration loop that can plan actions, invoke external tools, observe results, and iterate until a goal is met. Unlike a vanilla chatbot that produces a single reply, an agentic system breaks a request into sub-tasks, calls APIs or databases, writes and runs code, and self-corrects based on feedback.",
      "fundamentals": "Core loop: Observe -> Think -> Act -> Observe. The LLM receives the goal plus accumulated context (scratchpad), reasons about the next step, emits a structured tool call, the harness executes it, appends the result, and re-prompts. ReAct (Reason + Act) is the foundational pattern. Key design axes: single-agent vs. multi-agent, sequential vs. parallel tool calls, human-in-the-loop checkpoints, and memory persistence (short-term scratchpad vs. long-term vector store). Reliability depends on the model's instruction-following and the harness's error recovery.",
      "related": [
        "function-calling",
        "mcp",
        "rag",
        "inference-engine"
      ],
      "seen_in": [
        "documentation",
        "serving-config"
      ],
      "foundational_papers": [
        {
          "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
          "authors": "Yao et al.",
          "venue": "ICLR 2023",
          "arxiv": "2210.03629"
        },
        {
          "title": "Toolformer: Language Models Can Teach Themselves to Use Tools",
          "authors": "Schick et al.",
          "venue": "NeurIPS 2023",
          "arxiv": "2302.04761"
        }
      ],
      "resources": [
        {
          "label": "Agentic AI explained (MIT Sloan)",
          "url": "https://mitsloan.mit.edu/ideas-made-to-matter/agentic-ai-explained"
        },
        {
          "label": "What is Agentic AI (IBM)",
          "url": "https://www.ibm.com/think/topics/agentic-ai"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "alibi": {
      "id": "alibi",
      "name": "ALiBi",
      "expansion": "Attention with Linear Biases",
      "category": "position-encodings",
      "oneliner": "Adds a fixed linear penalty to attention scores based on query-key distance — no position embeddings at all, with good length extrapolation.",
      "explanation": "ALiBi is a position encoding method that adds a fixed distance-based penalty directly to attention scores instead of using position embeddings. Nearby tokens receive a small penalty while distant tokens receive a larger one, creating a natural decay of attention with distance. This simple approach allows models to extrapolate to longer sequences than they were trained on. ALiBi is used in BLOOM, Falcon, and MPT, but has become less common as RoPE scaling methods improved.",
      "fundamentals": "Modified attention: a_{i,j} = $q_i^T$·$k_j$ - m_h·|i-j|. For H=8 heads: slopes = {1/2, 1/4, 1/8, ..., 1/256}. Steep slopes = strong locality, gentle slopes = far attention. Causal: bias = -m_h·(i-j) for j≤i. The bias matrix is lower-triangular. $O(1)$ additional params (fixed slopes). Extrapolation works because linear function is defined for any distance — no OOD issue. Cannot learn to attend strongly to a specific far position through positional info alone (content can override).",
      "seen_in": [
        "model-config"
      ],
      "related": [
        "rope",
        "sinusoidal-pe",
        "learned-pe"
      ],
      "sources": [
        "Press et al., 'Train Short, Test Long,' ICLR 2022, arXiv:2108.12409"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Train Short, Test Long: Attention with Linear Biases Enables Input Length Generalization",
          "authors": "Press et al.",
          "venue": "ICLR 2022",
          "arxiv": "2108.12409"
        }
      ]
    },
    "api-provider": {
      "id": "api-provider",
      "name": "API Providers",
      "expansion": "LLM API Providers",
      "category": "scaling-patterns",
      "oneliner": "Companies offering hosted LLM inference via API — OpenAI, Anthropic, Google, Cohere, Mistral, Together AI, Groq, Fireworks, and others.",
      "explanation": "API providers host language models and offer them as a service via HTTP endpoints, so you can use powerful models without managing GPU infrastructure. OpenAI (GPT-4, o1), Anthropic (Claude), and Google (Gemini) offer proprietary models. Together AI, Fireworks, and Groq offer open-weight models (Llama, Mistral, Qwen) at competitive prices with high throughput. Groq is notable for custom LPU hardware delivering extremely low latency.",
      "related": [
        "inference",
        "inference-engine"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "OpenAI API",
          "url": "https://platform.openai.com"
        },
        {
          "label": "Together AI",
          "url": "https://together.ai"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "aqlm": {
      "id": "aqlm",
      "name": "AQLM",
      "expansion": "Additive Quantization of Language Models",
      "category": "quantization-methods",
      "oneliner": "Extreme compression (2-bit) using learned additive codebooks — highest quality at very low bitwidths.",
      "explanation": "AQLM is a weight compression method that represents groups of model weights as combinations of entries from multiple small learned codebooks. Each weight group is encoded as a sum of codewords drawn from separate codebooks, a technique called additive vector quantization. This approach dominates at extreme 2-bit compression where simpler rounding methods fall apart. Quantization is slow but inference runs efficiently with custom GPU kernels.",
      "fundamentals": "For a group of K weights: w $\\approx$ c1[i1] + c2[i2] + ... + cM[iM] where c_m are learned codebooks and i_m are indices stored as compact integers. Training alternates: fix codebooks, find best indices; fix indices, optimize codebooks. Effective bpw depends on M, B, K parameters.",
      "seen_in": [
        "model-name",
        "repo-name"
      ],
      "related": [
        "inference",
        "quantization",
        "quip-sharp"
      ],
      "sources": [
        "Egiazarian et al., 'AQLM,' arXiv:2401.06118"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "AQLM: Extreme Compression of Language Models via Additive Quantization",
          "authors": "Egiazarian et al.",
          "venue": "ICML 2024",
          "arxiv": "2401.06118"
        }
      ]
    },
    "attention-mask": {
      "id": "attention-mask",
      "name": "Attention Mask",
      "expansion": "Attention Mask",
      "category": "attention-variants",
      "oneliner": "A binary mask that controls which token pairs can attend to each other, handling padding, causality, and custom patterns.",
      "explanation": "An attention mask is a binary tensor that controls which token positions can attend to each other during the attention computation. It serves two main purposes: padding masks prevent the model from attending to placeholder tokens when batching sequences of different lengths, and causal masks enforce the left-to-right constraint in autoregressive generation. In HuggingFace Transformers, real tokens are marked with 1 and padding with 0.",
      "fundamentals": "Applied as: $\\text{Attention}(Q,K,V) = \\text{softmax}\\left(\\frac{QK^\\top}{\\sqrt{d_k}} + M_{\\text{add}}\\right)V$, where $M_{\\text{add}}$ contains $0$ for allowed positions and $-\\infty$ for blocked positions. For padded batches, the padding mask has shape $1 \\times L$ and is broadcast to $L \\times L$, combined with the causal mask via element-wise minimum.",
      "related": [
        "mha",
        "token"
      ],
      "seen_in": [
        "code",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Attention Is All You Need",
          "authors": "Vaswani et al.",
          "venue": "NeurIPS 2017",
          "arxiv": "1706.03762"
        }
      ],
      "resources": [
        {
          "label": "HuggingFace Glossary: Attention mask",
          "url": "https://huggingface.co/docs/transformers/glossary#attention-mask"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "awq": {
      "id": "awq",
      "name": "AWQ",
      "expansion": "Activation-Aware Weight Quantization",
      "category": "quantization-methods",
      "oneliner": "Identifies the most important ~1% of weight channels via activation magnitudes, then protects them with per-channel scaling.",
      "explanation": "AWQ is a weight quantization method that identifies which weight channels matter most by looking at activation patterns during inference. Channels that consistently produce large activations amplify any rounding errors, so AWQ scales those channels before quantizing to protect them. It only needs a quick forward pass for calibration with no gradients or Hessian, making it fast even for 70B models.",
      "fundamentals": "1) Calibrate: s_x(j) = mean(|X[:,j]|) per channel j. 2) Compute scaling: s(j) = s_x(j)^$\\alpha$, $\\alpha$ found by grid search minimizing $\\|\\|Q(W·diag(s))·(X/diag(s)) - WX\\|\\|$. 3) Store W' = W·diag(s), quantize W'. 4) At runtime: activations divided by s (fused into preceding LayerNorm).",
      "seen_in": [
        "model-name",
        "filename"
      ],
      "related": [
        "gptq",
        "inference",
        "layernorm",
        "quantization"
      ],
      "sources": [
        "Lin et al., 'AWQ,' MLSys 2024, arXiv:2306.00978"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration",
          "authors": "Lin et al.",
          "venue": "MLSys 2024",
          "arxiv": "2306.00978"
        }
      ],
      "resources": [
        {
          "label": "HuggingFace — Selecting a Quantization Method",
          "url": "https://huggingface.co/docs/transformers/quantization/selecting"
        }
      ]
    },
    "backbone": {
      "id": "backbone",
      "name": "Backbone",
      "expansion": "Model Backbone (trunk)",
      "category": "layer-types",
      "oneliner": "The main body of a model — embedding layer plus transformer blocks — that produces contextual representations before any task-specific head.",
      "explanation": "The backbone is the main body of a transformer model, comprising the embedding table, positional encoding, and the full stack of transformer layers. It converts a sequence of token IDs into rich contextual hidden states that a task-specific head then uses to make predictions. When people reference a model like LLaMA-3 70B, they mostly mean the backbone. The backbone is pretrained once and can be reused across many tasks via different heads or fine-tuning.",
      "fundamentals": "A decoder-only backbone maps tokens $x_1, \\ldots, x_n$ to hidden states through $L$ layers. Each layer: $h' = h + \\text{Attention}(\\text{Norm}(h))$, then $h'' = h' + \\text{FFN}(\\text{Norm}(h'))$. Total params $\\approx L \\cdot 12d^2$ for standard MHA with $4d$ FFN, plus embedding $|V| \\cdot d$.",
      "related": [
        "ffn",
        "mha",
        "model-head",
        "residual-connection",
        "token"
      ],
      "seen_in": [
        "model-config",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Attention Is All You Need",
          "authors": "Vaswani et al.",
          "venue": "NeurIPS 2017",
          "arxiv": "1706.03762"
        }
      ],
      "resources": [
        {
          "label": "HuggingFace Model summary",
          "url": "https://huggingface.co/docs/transformers/model_summary"
        },
        {
          "label": "Sebastian Raschka — The Big LLM Architecture Comparison",
          "url": "https://magazine.sebastianraschka.com/p/the-big-llm-architecture-comparison"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "beam-search": {
      "id": "beam-search",
      "name": "Beam Search",
      "expansion": "Beam Search Decoding",
      "category": "sampling-decoding",
      "oneliner": "A decoding algorithm that tracks the top-k most probable partial sequences at each step, finding higher-quality outputs than greedy decoding at the cost of more compute.",
      "explanation": "Beam search maintains a set of k candidate sequences (the beam) at each generation step, expanding each by one token and keeping only the k most probable paths. This explores a wider search space than greedy decoding, often finding better overall sequences. It was the standard decoding method for machine translation before sampling became popular for chat.",
      "related": [
        "greedy-decoding",
        "temperature",
        "top-p"
      ],
      "seen_in": [
        "code",
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "HuggingFace — Decoding strategies",
          "url": "https://huggingface.co/blog/mlabonne/decoding-strategies"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "bf16": {
      "id": "bf16",
      "name": "bf16 / bfloat16",
      "expansion": "Brain Floating Point 16-bit (Google Brain Float 16)",
      "category": "precision-formats",
      "oneliner": "A 16-bit float with fp32's exponent range but reduced mantissa — designed for deep learning training without loss scaling.",
      "explanation": "BFloat16 is a 16-bit floating-point format developed by Google Brain that keeps the same 8-bit exponent as 32-bit float, giving it an enormous numerical range. This means training gradients almost never overflow and loss scaling is unnecessary, unlike with fp16. The tradeoff is a shorter mantissa giving less precision, but neural networks tolerate this well. BFloat16 is the default training format for most major labs and runs on TPUs, NVIDIA Ampere GPUs, and newer.",
      "fundamentals": "Bit layout: 1 sign | 8 exponent (bias 127) | 7 mantissa. Same exponent as fp32 — conversion is just truncating the lower 16 bits. Precision: ~2.4 decimal digits (vs fp16's ~3.3). Two values differing by less than 1 part in 128 map to the same bf16.",
      "seen_in": [
        "model-name",
        "filename",
        "training-config"
      ],
      "related": [
        "fp16",
        "fp32"
      ],
      "sources": [
        "Kalamkar et al., 'A Study of BFLOAT16 for Deep Learning Training,' arXiv:1905.12322"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "A Study of BFLOAT16 for Deep Learning Training",
          "authors": "Kalamkar et al.",
          "venue": "2019",
          "arxiv": "1905.12322"
        }
      ]
    },
    "bitsandbytes": {
      "id": "bitsandbytes",
      "name": "BitsAndBytes (bnb)",
      "expansion": "bitsandbytes — Tim Dettmers' quantization library",
      "category": "quantization-methods",
      "oneliner": "The library behind LLM.int8() and QLoRA. Offers 8-bit (absmax) and 4-bit (nf4/fp4) quantization with deep HuggingFace integration.",
      "explanation": "Bitsandbytes is a CUDA library that enables 8-bit and 4-bit quantization for large language models. Its 8-bit mode keeps statistical outlier features in 16-bit while compressing the rest, and its 4-bit mode powers QLoRA by loading base weights in the NF4 format while LoRA adapters train in 16-bit. It integrates with HuggingFace Transformers via simple load_in_8bit and load_in_4bit flags, making it the go-to tool for memory-efficient fine-tuning.",
      "fundamentals": "8-bit: per-vector absmax scaling. Outlier decomposition: X = X_normal + X_outlier, compute separately in int8 and fp16, sum results. 4-bit: per-group (g=64 default) with nf4 or fp4 dtype. Double quant: group the fp32 absmax values into blocks of 256, quantize to fp8 with their own absmax.",
      "seen_in": [
        "model-name",
        "repo-name"
      ],
      "related": [
        "adapters",
        "fp16",
        "fp32",
        "fp8",
        "int8",
        "lora",
        "nf4",
        "qlora",
        "quantization"
      ],
      "sources": [
        "Dettmers et al., 'LLM.int8(),' arXiv:2208.07339",
        "Dettmers et al., 'QLoRA,' arXiv:2305.14314"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale",
          "authors": "Dettmers et al.",
          "venue": "NeurIPS 2022",
          "arxiv": "2208.07339"
        },
        {
          "title": "QLoRA: Efficient Finetuning of Quantized Language Models",
          "authors": "Dettmers et al.",
          "venue": "NeurIPS 2023",
          "arxiv": "2305.14314"
        }
      ],
      "resources": [
        {
          "label": "HuggingFace — bitsandbytes quantization",
          "url": "https://huggingface.co/docs/transformers/en/quantization/bitsandbytes"
        }
      ]
    },
    "calibration-data": {
      "id": "calibration-data",
      "name": "Calibration Data",
      "expansion": "Quantization Calibration Dataset",
      "category": "quantization-basics",
      "oneliner": "A small dataset (~128 samples) run through the model to collect activation statistics needed by quantization methods to set scale factors.",
      "explanation": "Calibration data is a small representative dataset used to measure activation ranges inside a model so quantization methods can set accurate scaling parameters. The standard practice is 128 samples of 2048 tokens, typically from the C4 or WikiText-2 datasets. Methods like GPTQ and AWQ require calibration, while simpler round-to-nearest approaches do not. Using data matched to the deployment domain yields slightly better quantized model quality.",
      "fundamentals": "For layer Y=XW+b: calibration records per-layer activation stats. Approaches: min/max (simple but outlier-sensitive), percentile (clips outliers), MSE-optimal (minimizes reconstruction error), entropy/KL-based (TensorRT). For GPTQ: calibration computes Hessian H = 2XX^T, used to solve layer-wise optimization min_Q $\\|\\|WX - QX\\|\\|^2$.",
      "seen_in": [
        "documentation",
        "quantization-scripts"
      ],
      "related": [
        "awq",
        "gptq",
        "imatrix",
        "quantization"
      ],
      "sources": [
        "Frantar et al., 'GPTQ,' arXiv:2210.17323",
        "Nagel et al., arXiv:2106.08295"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "GPTQ paper (established 128-sample C4 calibration standard)",
          "url": "https://arxiv.org/abs/2210.17323"
        },
        {
          "label": "White Paper on Neural Network Quantization",
          "url": "https://arxiv.org/abs/2106.08295"
        }
      ]
    },
    "causal-lm": {
      "id": "causal-lm",
      "name": "Causal Language Modeling",
      "expansion": "Causal (Autoregressive) Language Modeling",
      "category": "training-pipeline",
      "oneliner": "The training objective where a model predicts the next token using only the tokens to its left — the foundation of GPT-style generation.",
      "explanation": "Causal language modeling is the training objective where a model predicts the next token using only the tokens to its left, never peeking ahead. A causal attention mask ensures each position can only attend to preceding ones. This left-to-right constraint is the pre-training objective behind GPT, Llama, and Mistral, and it enables autoregressive text generation at inference time.",
      "fundamentals": "The causal LM objective maximises $\\mathcal{L} = \\sum_{t=1}^{T} \\log P(x_t \\mid x_{<t}; \\theta)$. The causal mask is a lower-triangular binary matrix $M$ where $M_{ij} = 1$ iff $j \\leq i$. Perplexity $= \\exp(-\\mathcal{L}/T)$ is the standard evaluation metric.",
      "related": [
        "attention-mask",
        "inference",
        "pre-training",
        "sft",
        "token"
      ],
      "seen_in": [
        "model-config",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Language Models are Few-Shot Learners",
          "authors": "Brown et al.",
          "venue": "NeurIPS 2020",
          "arxiv": "2005.14165"
        }
      ],
      "resources": [
        {
          "label": "HuggingFace CLM training guide",
          "url": "https://huggingface.co/docs/transformers/tasks/language_modeling"
        },
        {
          "label": "Jay Alammar — The Illustrated GPT-2",
          "url": "https://jalammar.github.io/illustrated-gpt2/"
        },
        {
          "label": "Andrej Karpathy — Let's Build GPT",
          "url": "https://karpathy.ai/zero-to-hero.html"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "chain-of-thought": {
      "id": "chain-of-thought",
      "name": "Chain-of-Thought",
      "expansion": "Chain-of-Thought (CoT)",
      "category": "scaling-patterns",
      "oneliner": "Prompting or training a model to show intermediate reasoning steps before the final answer — converts System-2 problems into sequences of easier System-1 steps.",
      "explanation": "Chain-of-thought is a technique where a language model generates intermediate reasoning steps before producing a final answer. Introduced as a prompting method by Wei et al. in 2022, it dramatically improved accuracy on math and logic tasks. In the reasoning model era, CoT shifted from a prompting trick to a trained behavior: o1 and R1 are RL-trained to produce thousands of reasoning tokens internally, making CoT the core mechanism behind test-time compute scaling.",
      "fundamentals": "Prompting form: $(x, z_1, z_2, \\ldots, z_n, y)$ where $z_i$ are reasoning steps. Few-shot CoT provides exemplar traces; zero-shot CoT uses a trigger phrase. Trained CoT (reasoning models): RL optimizes $\\pi_\\theta(z_{1:T}, y | x)$ end-to-end, rewarding only final-answer correctness. Self-consistency: sample $k$ CoT paths, majority-vote on $y$, boosts accuracy at $O(k)$ inference cost. Process Reward Models (PRMs) score individual steps $r(z_i)$ to guide search over reasoning traces.",
      "related": [
        "reasoning-models",
        "test-time-compute",
        "thinking-tokens"
      ],
      "seen_in": [
        "model-cards",
        "API-docs",
        "prompting-guides"
      ],
      "foundational_papers": [
        {
          "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
          "authors": "Wei et al.",
          "venue": "NeurIPS 2022",
          "arxiv": "2201.11903"
        },
        {
          "title": "Large Language Models are Zero-Shot Reasoners",
          "authors": "Kojima et al.",
          "venue": "NeurIPS 2022",
          "arxiv": "2205.11916"
        },
        {
          "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
          "authors": "Wang et al.",
          "venue": "ICLR 2023",
          "arxiv": "2203.11171"
        },
        {
          "title": "Let's Verify Step by Step (PRM800K)",
          "authors": "Lightman et al.",
          "venue": "ICLR 2024",
          "arxiv": "2305.20050"
        }
      ],
      "resources": [
        {
          "label": "Wei et al. CoT paper",
          "url": "https://arxiv.org/abs/2201.11903"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "chat-template": {
      "id": "chat-template",
      "name": "Chat Template",
      "expansion": "Chat Template (Prompt Format)",
      "category": "model-naming",
      "oneliner": "The specific token format a chat model expects — ChatML, Llama format, Alpaca format — getting this wrong produces garbage output from an otherwise good model.",
      "explanation": "A chat template defines how system messages, user turns, and assistant turns are formatted with special tokens before being fed to the model. Each model family uses a different format: Llama uses special header tokens, ChatML uses role tags, and Alpaca uses instruction/response markers. Using the wrong template is one of the most common deployment mistakes — the model sees unexpected tokens and produces incoherent output. HuggingFace stores the template in tokenizer_config.",
      "related": [
        "tag-instruct",
        "sft",
        "system-prompt",
        "token"
      ],
      "seen_in": [
        "model-config",
        "code"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "HuggingFace — Chat templates",
          "url": "https://huggingface.co/docs/transformers/chat_templating"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "chatbot-arena": {
      "id": "chatbot-arena",
      "name": "Chatbot Arena",
      "expansion": "LMSYS Chatbot Arena",
      "category": "scaling-patterns",
      "oneliner": "A crowdsourced evaluation platform where users chat with two anonymous models side-by-side and vote for the better one, producing Elo-style rankings.",
      "explanation": "Chatbot Arena is a live evaluation platform from LMSYS where users interact with two anonymous language models simultaneously and vote for the preferred response. Votes are aggregated into an Elo-style leaderboard that ranks models by human preference. Unlike static benchmarks that test narrow skills, Arena captures real-world conversational quality across diverse user intents.",
      "related": [
        "evals",
        "mmlu",
        "llm-as-judge"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference",
          "authors": "Chiang et al.",
          "venue": "ICML 2024",
          "arxiv": "2403.04132"
        }
      ],
      "resources": [
        {
          "label": "Chatbot Arena",
          "url": "https://lmarena.ai"
        },
        {
          "label": "Arena leaderboard",
          "url": "https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "chunked-prefill": {
      "id": "chunked-prefill",
      "name": "Chunked Prefill",
      "expansion": "Chunked Prefill (Disaggregated Prompt Processing)",
      "category": "scaling-patterns",
      "oneliner": "Splitting long prompt processing into smaller chunks so a single large request doesn't block token generation for other users.",
      "explanation": "Chunked prefill breaks the processing of a long input prompt into smaller pieces interleaved with decode steps for other requests. Without it, a single request with a very long prompt would monopolize the GPU while other users wait. By processing the prompt in chunks, the server alternates between prefill and decode work, keeping latency low for concurrent users. This is a standard feature in vLLM and SGLang for production serving.",
      "fundamentals": "Prefill is compute-bound ($O(n^2 d)$ for attention over $n$ input tokens), while decode is bandwidth-bound ($O(Pd)$ per token). A naive scheduler runs the full prefill as one operation, blocking all decode steps. Chunked prefill splits input into chunks of size $C$ (e.g., 2048 tokens), processing one chunk per scheduler iteration. Between chunks, pending decode requests get their next token. The tradeoff: smaller $C$ = better latency for other users but slower total prefill time due to repeated KV cache loads.",
      "related": [
        "inference",
        "continuous-batching",
        "kv-cache",
        "inference-metrics"
      ],
      "seen_in": [
        "serving-config",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Sarathi-Serve: Chunked Prefill for Efficient LLM Inference",
          "authors": "Agrawal et al.",
          "venue": "2024",
          "arxiv": "2403.02310"
        }
      ],
      "resources": [
        {
          "label": "vLLM chunked prefill docs",
          "url": "https://docs.vllm.ai/en/latest/features/chunked_prefill.html"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "clip": {
      "id": "clip",
      "name": "CLIP",
      "expansion": "CLIP — Contrastive Language-Image Pre-training",
      "category": "layer-types",
      "oneliner": "OpenAI's model that jointly trains image and text encoders to map both modalities into a shared embedding space using contrastive learning.",
      "explanation": "CLIP is a model from OpenAI that learns to associate images with text by training on 400 million image-caption pairs. An image encoder and text encoder are trained so matching pairs produce similar embeddings while mismatched pairs produce different ones. This shared embedding space enables zero-shot image classification and image search by text. CLIP serves as the vision backbone in many multimodal models and diffusion systems like Stable Diffusion.",
      "related": [
        "vlm",
        "embeddings",
        "diffusion-models",
        "tag-embed"
      ],
      "seen_in": [
        "model-config",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Learning Transferable Visual Models From Natural Language Supervision",
          "authors": "Radford et al.",
          "venue": "ICML 2021",
          "arxiv": "2103.00020"
        }
      ],
      "resources": [
        {
          "label": "OpenAI CLIP paper",
          "url": "https://arxiv.org/abs/2103.00020"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "constitutional-ai": {
      "id": "constitutional-ai",
      "name": "Constitutional AI",
      "expansion": "Constitutional AI (CAI)",
      "category": "safety-alignment",
      "oneliner": "Anthropic's alignment approach where the model critiques and revises its own outputs against a set of written principles, reducing reliance on human preference labels.",
      "explanation": "Constitutional AI is an alignment framework from Anthropic where a model self-improves by critiquing its own responses against a written constitution of principles. Instead of collecting thousands of human preference labels, the model generates a response, then generates a critique of that response based on principles like helpfulness and harmlessness, then revises its answer. This self-critique data is used for RLHF training, replacing human annotators with AI feedback (RLAIF).",
      "related": [
        "rlhf",
        "dpo",
        "red-teaming",
        "rlaif"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Constitutional AI: Harmlessness from AI Feedback",
          "authors": "Bai et al.",
          "venue": "2022",
          "arxiv": "2212.08073"
        }
      ],
      "resources": [
        {
          "label": "Anthropic CAI paper",
          "url": "https://arxiv.org/abs/2212.08073"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "context-engineering": {
      "id": "context-engineering",
      "name": "Context Engineering",
      "expansion": "Context Engineering (Prompt Architecture)",
      "category": "scaling-patterns",
      "oneliner": "The practice of carefully structuring everything that goes into an LLM's context window — system prompts, retrieved documents, conversation history, and tool results.",
      "explanation": "Context engineering is the discipline of designing what information fills a model's context window and how it is organized. Unlike simple prompt engineering which focuses on phrasing a single question, context engineering manages the entire input architecture: system instructions, retrieved RAG documents, conversation history, tool call results, and few-shot examples.",
      "fundamentals": "Key techniques: system prompt design (persona, constraints, output format), RAG chunk selection and ordering (recency vs relevance), conversation history compression (summarization of older turns), tool result formatting (structured vs natural language), few-shot example selection (diversity, difficulty progression). Lost-in-the-middle: models attend most to the beginning and end of context, so place critical information there. Token budgeting: allocate fixed proportions of the context window to each component.",
      "related": [
        "rag",
        "context-window",
        "kv-cache",
        "prefix-caching"
      ],
      "seen_in": [
        "documentation",
        "code"
      ],
      "foundational_papers": [
        {
          "title": "Lost in the Middle: How Language Models Use Long Contexts",
          "authors": "Liu et al.",
          "venue": "TACL 2024",
          "arxiv": "2307.03172"
        }
      ],
      "resources": [
        {
          "label": "Anthropic — Building effective agents",
          "url": "https://docs.anthropic.com/en/docs/build-with-claude/prompt-engineering"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "context-window": {
      "id": "context-window",
      "name": "Context Window",
      "expansion": "Context Window (Maximum Sequence Length)",
      "category": "quantization-basics",
      "oneliner": "The maximum number of tokens a model can process in a single forward pass — typically 4K to 1M tokens depending on the model.",
      "explanation": "The context window is the maximum number of tokens a model can process in one session. Everything — system prompt, conversation history, documents, and response — must fit within it. Early models had 2K-4K contexts; modern models reach 128K (Llama 3.1) or 1M tokens (Qwen 2.5). Extending context requires RoPE scaling, efficient attention, and careful KV cache management. Context length is one of the most important constraints when deploying LLMs.",
      "fundamentals": "Attention cost scales as $O(n^2 d)$ where $n$ = context length, making long contexts expensive. KV cache memory scales as $O(n \\cdot L \\cdot d_{kv})$ per sequence. For Llama 3 70B at 128K context: KV cache alone is ~40 GB in fp16. Practical context is often shorter than the advertised maximum due to quality degradation at the extremes — check model cards for validated effective lengths.",
      "related": [
        "kv-cache",
        "rope",
        "yarn",
        "abf",
        "swa",
        "flash-attention",
        "size-context"
      ],
      "seen_in": [
        "model-config",
        "model-cards"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "HuggingFace — Long context models",
          "url": "https://huggingface.co/docs/transformers/llm_optims"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "continuous-batching": {
      "id": "continuous-batching",
      "name": "Continuous Batching",
      "expansion": "Continuous Batching / Iteration-Level Batching",
      "category": "scaling-patterns",
      "oneliner": "Dynamically insert/remove requests from a running batch at each generation step — eliminates wasted slots from variable-length outputs. 2-4$\\times$ throughput gain.",
      "explanation": "Continuous batching dynamically adds and removes requests from a running inference batch after each token generation step, rather than waiting for every request to finish. Short requests release their slots immediately, and waiting requests fill them. This eliminates the wasted GPU time of static batching, where short requests sit idle while long ones complete. Combined with PagedAttention, continuous batching delivers two to four times higher throughput.",
      "fundamentals": "Per-iteration scheduler: 1) Generate one token for all active requests. 2) Remove finished (EOS/max_len), free KV cache. 3) Fill freed slots with new requests (run prefill). 4) Repeat. Prefill vs decode asymmetry: prefill is compute-bound (many tokens), decode is memory-bound (one token). Advanced: chunked prefill interleaves prefill chunks with decode steps. Concrete: static batching wastes 60-80% KV memory on fragmentation; PagedAttention reduces to <4%.",
      "seen_in": [
        "serving-config",
        "documentation"
      ],
      "related": [
        "inference",
        "kv-cache",
        "paged-attention",
        "speculative-decoding",
        "token"
      ],
      "sources": [
        "Yu et al., 'Orca: A Distributed Serving System,' OSDI 2022",
        "Kwon et al., 'PagedAttention,' SOSP 2023, arXiv:2309.06180"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Orca: A Distributed Serving System for Transformer-Based Generative Models",
          "authors": "Yu et al.",
          "venue": "OSDI 2022",
          "arxiv": null
        }
      ],
      "resources": [
        {
          "label": "Lilian Weng — LLM Inference Optimization",
          "url": "https://lilianweng.github.io/posts/2023-01-10-inference-optimization/"
        }
      ]
    },
    "cpt": {
      "id": "cpt",
      "name": "CPT",
      "expansion": "Continued Pre-Training (Domain-Adaptive Pre-Training)",
      "category": "training-pipeline",
      "oneliner": "Additional pre-training on domain-specific data (code, medical, legal) to inject specialized knowledge — same loss as pre-training, 2-10% of the cost.",
      "explanation": "Continued pre-training takes an existing base model and runs additional pre-training on a domain-specific corpus such as code, medical text, or legal documents. It uses the same next-token prediction objective as original pre-training but shifts the data distribution toward the target domain. The main risk is catastrophic forgetting, mitigated by mixing in general data. Examples include Code Llama, Meditron, and SaulLM.",
      "fundamentals": "Loss: identical to pre-training (autoregressive CE). Only data distribution changes. Hyperparams: lr 1e-5 to 5e-5, cosine or constant schedule. Data mixing: ~70-90% domain + 10-30% general replay.",
      "seen_in": [
        "model-name",
        "model-cards"
      ],
      "related": [
        "pre-training",
        "sft",
        "token"
      ],
      "foundational_papers": [
        {
          "title": "Don't Stop Pretraining: Adapt Pretrained Language Models to Diverse Text Domains",
          "authors": "Gururangan et al.",
          "venue": "ACL 2020",
          "arxiv": "2004.10964"
        },
        {
          "title": "Code Llama: Open Foundation Models for Code",
          "authors": "Rozière et al.",
          "venue": "2023",
          "arxiv": "2308.12950"
        },
        {
          "title": "Minerva: Solving Quantitative Reasoning Problems with Language Models",
          "authors": "Lewkowycz et al.",
          "venue": "NeurIPS 2022",
          "arxiv": "2206.14858"
        }
      ],
      "sources": [
        "Gururangan et al., arXiv:2004.10964"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10"
    },
    "cross-attention": {
      "id": "cross-attention",
      "name": "Cross-Attention",
      "expansion": "Cross-Attention (Encoder-Decoder Attention)",
      "category": "attention-variants",
      "oneliner": "Attention between two different sequences — the decoder attending to the encoder's output in seq2seq models, or text attending to image features in VLMs.",
      "explanation": "Cross-attention is the mechanism where queries come from one sequence and keys/values come from a different sequence. In encoder-decoder models like T5, the decoder uses cross-attention to look at the encoder's output while generating each token. In vision-language models, text tokens use cross-attention over image patch embeddings to ground language in visual content.",
      "related": [
        "self-attention",
        "mha",
        "seq2seq",
        "vlm",
        "diffusion-models"
      ],
      "seen_in": [
        "code",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Attention Is All You Need",
          "authors": "Vaswani et al.",
          "venue": "NeurIPS 2017",
          "arxiv": "1706.03762"
        }
      ],
      "resources": [],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "data-contamination": {
      "id": "data-contamination",
      "name": "Data Contamination",
      "expansion": "Benchmark Data Contamination",
      "category": "scaling-patterns",
      "oneliner": "When evaluation benchmark data leaks into training sets, inflating reported model scores and making comparisons unreliable.",
      "explanation": "Data contamination occurs when text from evaluation benchmarks accidentally appears in a model's training data, artificially inflating the model's scores on those benchmarks. Since LLMs are trained on massive web crawls, and many benchmarks are publicly available online, overlap is almost inevitable. Contamination makes benchmark comparisons between models unreliable — a model trained on MMLU questions will score higher without being genuinely smarter.",
      "related": [
        "evals",
        "mmlu",
        "humaneval",
        "pre-training"
      ],
      "seen_in": [
        "model-cards",
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "data-deduplication": {
      "id": "data-deduplication",
      "name": "Data Deduplication",
      "expansion": "Training Data Deduplication",
      "category": "datasets-recipes",
      "oneliner": "Removing duplicate or near-duplicate text from training corpora to improve model quality, reduce memorization, and lower training cost.",
      "explanation": "Data deduplication removes repeated or near-identical content from training datasets before pre-training begins. Without it, the model memorizes duplicated passages and wastes training compute on redundant data. Exact dedup removes identical documents via hashing. Fuzzy dedup uses MinHash or SimHash to find near-duplicates with small edits. The Llama 3 training pipeline removed duplicates at both document and paragraph level.",
      "related": [
        "pre-training",
        "training-recipe",
        "scaling-laws"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Deduplicating Training Data Makes Language Models Better",
          "authors": "Lee et al.",
          "venue": "ACL 2022",
          "arxiv": "2107.06499"
        }
      ],
      "resources": [
        {
          "label": "NVIDIA NeMo Curator",
          "url": "https://github.com/NVIDIA/NeMo-Curator"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "data-mixing": {
      "id": "data-mixing",
      "name": "Data Mixing",
      "expansion": "Pre-Training Data Mix (Data Recipe)",
      "category": "datasets-recipes",
      "oneliner": "The proportions of different data sources — web text, code, books, academic papers, domain text — used in a pre-training corpus. A key decision that shapes model capabilities.",
      "explanation": "Data mixing is the practice of balancing different data sources in a pre-training corpus to shape the model's capabilities. A typical mix includes web crawl data (60-80 percent), code (5-15 percent), books and academic papers (5-10 percent), and curated domain data. The mix ratios directly affect what the model is good at — more code data produces better coding ability, more math data improves reasoning. Llama 3 trained on 15 trillion tokens with carefully tuned proportions.",
      "related": [
        "pre-training",
        "training-recipe",
        "scaling-laws",
        "data-deduplication"
      ],
      "seen_in": [
        "documentation",
        "model-cards"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "Llama 3 training details",
          "url": "https://arxiv.org/abs/2407.21783"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "data-parallelism": {
      "id": "data-parallelism",
      "name": "Data Parallelism",
      "expansion": "Data Parallelism (DDP / Distributed Data Parallel)",
      "category": "scaling-patterns",
      "oneliner": "Replicating the full model on every GPU and splitting the training data across them — the simplest multi-GPU training strategy.",
      "explanation": "Data parallelism is the simplest multi-GPU training strategy. Each GPU holds a complete copy of the model and processes a different slice of the training batch. After each step, gradients are averaged across all GPUs using an AllReduce operation so every copy stays synchronized. The main limitation is memory: every GPU must fit the entire model plus optimizer states. ZeRO was invented to fix this by sharding the redundant copies across GPUs.",
      "fundamentals": "Standard DDP: each of $N$ GPUs stores full model ($P$ params), full gradients, full optimizer states. Per-GPU memory: $16P$ bytes (mixed precision Adam). Gradient sync: AllReduce of $2P$ bytes per step. Throughput scales nearly linearly with $N$ (minus communication overhead). FSDP (Fully Sharded Data Parallelism) and ZeRO eliminate the redundancy by sharding across GPUs.",
      "related": [
        "tp-pp",
        "zero",
        "full-ft"
      ],
      "seen_in": [
        "training-config",
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "Lilian Weng — How to Train Really Large Models",
          "url": "https://lilianweng.github.io/posts/2021-09-25-train-large/"
        },
        {
          "label": "PyTorch DDP docs",
          "url": "https://pytorch.org/docs/stable/notes/ddp.html"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "dataset-dolphin": {
      "id": "dataset-dolphin",
      "name": "Dolphin",
      "expansion": "Eric Hartford's uncensored fine-tunes",
      "category": "datasets-recipes",
      "oneliner": "High-quality instruction data with alignment refusals systematically removed. Philosophy: censorship should be user's choice. Applied across Llama, Mistral, Phi.",
      "seen_in": [
        "model-name"
      ],
      "related": [],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Eric Hartford's 'Uncensored Models' blog post",
          "url": "https://erichartford.com/uncensored-models"
        }
      ],
      "explanation": "Dolphin is an instruction-tuning dataset derived from high-quality sources like FLANv2 and OpenOrca, with alignment refusals and safety filters systematically stripped out. Created by Eric Hartford, its philosophy is that censorship decisions belong to the end user, not the training data. Dolphin-tuned models follow instructions without built-in refusals, which makes them popular for unrestricted research and customization."
    },
    "dataset-evolinstruct": {
      "id": "dataset-evolinstruct",
      "name": "EvolInstruct",
      "expansion": "WizardLM synthetic data augmentation",
      "category": "datasets-recipes",
      "oneliner": "Iteratively 'evolves' simple instructions into complex ones using an LLM. Creates WizardLM/WizardCoder/WizardMath training sets. By Microsoft Research.",
      "seen_in": [
        "model-name",
        "documentation"
      ],
      "related": [],
      "foundational_papers": [
        {
          "title": "WizardLM: Empowering Large Language Models to Follow Complex Instructions",
          "authors": "Xu et al.",
          "venue": "2023",
          "arxiv": "2304.12244"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "explanation": "Evol-Instruct is a data synthesis method from Microsoft Research that uses an LLM to iteratively rewrite simple instructions into progressively more complex versions. Each evolution step adds constraints, deepens reasoning, or broadens scope. This technique generates the training data behind WizardLM and WizardCoder, producing models that handle nuanced, multi-step prompts far better than models trained on basic instruction pairs."
    },
    "dataset-hermes": {
      "id": "dataset-hermes",
      "name": "Hermes / OpenHermes",
      "expansion": "NousResearch instruction-tuning dataset + fine-tunes",
      "category": "datasets-recipes",
      "oneliner": "~1M diverse instruction examples (OpenHermes-2.5). ChatML format. Strong instruction-following + function calling. By Teknium/NousResearch.",
      "seen_in": [
        "model-name",
        "dataset-name"
      ],
      "related": [
        "org-nousresearch"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "OpenHermes-2.5 dataset",
          "url": "https://huggingface.co/datasets/teknium/OpenHermes-2.5"
        }
      ],
      "explanation": "OpenHermes is a large-scale instruction-tuning dataset containing roughly one million diverse examples formatted in ChatML. Created by Teknium and NousResearch, it covers general knowledge, coding, roleplay, and function calling. Models fine-tuned on Hermes consistently rank well on community benchmarks, making it one of the most widely used open instruction datasets for training capable chat models."
    },
    "dataset-orca": {
      "id": "dataset-orca",
      "name": "Orca",
      "expansion": "Microsoft's explanation-tuned models",
      "category": "datasets-recipes",
      "oneliner": "Smaller models learn from GPT-4's step-by-step reasoning traces, not just input-output pairs. 'Explanation tuning.' Orca 2 teaches different strategies per task.",
      "seen_in": [
        "model-name"
      ],
      "related": [
        "knowledge-distillation"
      ],
      "foundational_papers": [
        {
          "title": "Orca: Progressive Learning from Complex Explanation Traces of GPT-4",
          "authors": "Mukherjee et al.",
          "venue": "2023",
          "arxiv": "2306.02707"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "explanation": "The Orca dataset captures detailed step-by-step reasoning traces from GPT-4, not just final answers. Smaller models trained on these traces learn the thinking process itself, a technique Microsoft Research calls explanation tuning. Orca-series models showed that a 13-billion-parameter model could match GPT-3.5 on many benchmarks by imitating how a stronger model reasons through problems rather than memorizing outputs."
    },
    "dataset-platypus": {
      "id": "dataset-platypus",
      "name": "Platypus",
      "expansion": "STEM/logic fine-tuning dataset (Boston University)",
      "category": "datasets-recipes",
      "oneliner": "~25K curated STEM reasoning questions. Platypus2-70B briefly topped Open LLM Leaderboard (Aug 2023). Small data, big results.",
      "seen_in": [
        "model-name"
      ],
      "related": [],
      "foundational_papers": [
        {
          "title": "Platypus: Quick, Cheap, and Powerful Refinement of LLMs",
          "authors": "Lee et al.",
          "venue": "2023",
          "arxiv": "2308.07317"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "explanation": "Open-Platypus is a carefully curated collection of roughly 25,000 STEM reasoning questions drawn from sources like ScienceQA and MATH benchmarks. Despite its small size, Platypus2-70B briefly topped the Hugging Face Open LLM Leaderboard in August 2023, demonstrating that high-quality data curation can outperform brute-force scale. It focuses on logical and scientific reasoning rather than broad instruction-following."
    },
    "dataset-ultrachat": {
      "id": "dataset-ultrachat",
      "name": "UltraChat / UltraFeedback",
      "expansion": "Large-scale instruction + preference datasets (Tsinghua)",
      "category": "datasets-recipes",
      "oneliner": "UltraChat: ~1.5M multi-turn dialogues for SFT. UltraFeedback: ~64K instructions with 4 responses scored by GPT-4 for DPO. Used to train Zephyr.",
      "seen_in": [
        "dataset-name",
        "model-cards"
      ],
      "related": [
        "sft",
        "dpo"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "UltraChat dataset",
          "url": "https://huggingface.co/datasets/stingning/ultrachat"
        },
        {
          "label": "UltraFeedback dataset",
          "url": "https://huggingface.co/datasets/openbmb/UltraFeedback"
        }
      ],
      "explanation": "UltraChat is a synthetic dataset of approximately 1.5 million multi-turn conversations designed for supervised fine-tuning of chat models. Its companion, UltraFeedback, contains around 64,000 instructions each paired with four model responses scored by GPT-4 for helpfulness and quality, making it ideal for Direct Preference Optimization. Together they power models like Zephyr-7B-beta."
    },
    "dense-models": {
      "id": "dense-models",
      "name": "Dense Models",
      "expansion": "Dense Model (all parameters active per token)",
      "category": "scaling-patterns",
      "oneliner": "100% of parameters active for every token — the standard transformer design. Simpler than MoE but compute scales linearly with param count.",
      "explanation": "A dense model is a standard transformer where every parameter participates in every forward pass, meaning all attention heads and feed-forward layers process every token. The term exists to distinguish from Mixture-of-Experts models that activate only a subset of parameters per token. Dense models are simpler to train, serve, and fine-tune, with no routing or load-balancing complexity. They are preferred when infrastructure simplicity matters.",
      "fundamentals": "Per-token FLOPs $\\approx$ 2 $\\times$ total_params (rough approximation). 70B dense model: ~140 GFLOPs/token. Same capacity class as a MoE with ~280B total/70B active — same per-token compute, but 4$\\times$ less memory for the dense model. When model card lists single param count without total/active distinction → dense. Examples: LLaMA 3 8B/70B/405B, Mistral 7B, Qwen 2.5 72B, Gemma 3 27B.",
      "seen_in": [
        "model-cards",
        "documentation"
      ],
      "related": [
        "moe",
        "token"
      ],
      "sources": [
        "Vaswani et al., 'Attention Is All You Need,' 2017",
        "Hoffmann et al., 'Training Compute-Optimal Large Language Models (Chinchilla),' arXiv:2203.15556"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Chinchilla scaling laws paper",
          "url": "https://arxiv.org/abs/2203.15556"
        },
        {
          "label": "LLaMA paper (over-training approach)",
          "url": "https://arxiv.org/abs/2302.13971"
        }
      ]
    },
    "dequantization": {
      "id": "dequantization",
      "name": "Dequantization",
      "expansion": "Inverse Quantization",
      "category": "quantization-basics",
      "oneliner": "Reverse mapping from low-precision quantized values back to approximate floats — happens on-the-fly during inference.",
      "explanation": "Dequantization converts low-precision quantized weights back to a higher-precision format during inference so they can participate in standard matrix multiplication. In a 4-bit scheme, integer weights are read from memory at 4 bits, multiplied by their group scale factor, and fed directly into 16-bit math. This is fused with the computation so only 4 bits per weight travel across the memory bus, saving bandwidth without materializing the full-precision weight matrix.",
      "fundamentals": "Integer: W_approx = scale $\\times$ (W_q - zero_point). Error: |W_original - W_approx| ≤ scale/2. nf4: W_approx = nf4_codebook[index] $\\times$ absmax. W8A8 output: Y_fp16 = Y_int32 $\\times$ (s_x $\\times$ s_w).",
      "seen_in": [
        "documentation"
      ],
      "related": [
        "inference",
        "nf4",
        "quantization",
        "w4a16",
        "w8a8"
      ],
      "sources": [
        "Frantar et al., 'Marlin,' arXiv:2408.11743"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Marlin kernel paper (fused dequantization)",
          "url": "https://arxiv.org/abs/2408.11743"
        }
      ]
    },
    "diffusion-models": {
      "id": "diffusion-models",
      "name": "Diffusion Models",
      "expansion": "Denoising Diffusion Probabilistic Models",
      "category": "layer-types",
      "oneliner": "Generative models that create images by learning to reverse a gradual noising process — the architecture behind Stable Diffusion, DALL-E, and Flux.",
      "explanation": "Diffusion models generate images by learning to reverse a process that gradually adds noise to data. Training teaches a neural network to predict and remove noise at each step. At generation time, the model starts from pure random noise and iteratively denoises it into a coherent image, guided by a text prompt via cross-attention with a text encoder like CLIP or T5.",
      "fundamentals": "Forward process: $x_t = \\sqrt{\\bar{\\alpha}_t} x_0 + \\sqrt{1-\\bar{\\alpha}_t} \\epsilon$ adds Gaussian noise over $T$ steps. Reverse process: a U-Net (or DiT transformer) predicts $\\epsilon_\\theta(x_t, t, c)$ where $c$ is the conditioning (text embedding). Loss: $\\|\\epsilon - \\epsilon_\\theta(x_t, t, c)\\|^2$. Latent diffusion: encode image to latent $z = E(x)$, diffuse in latent space, decode $\\hat{x} = D(\\hat{z})$. Classifier-free guidance scales the conditioning signal for quality vs diversity.",
      "related": [
        "self-attention",
        "backbone"
      ],
      "seen_in": [
        "model-config",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Denoising Diffusion Probabilistic Models",
          "authors": "Ho et al.",
          "venue": "NeurIPS 2020",
          "arxiv": "2006.11239"
        },
        {
          "title": "High-Resolution Image Synthesis with Latent Diffusion Models",
          "authors": "Rombach et al.",
          "venue": "CVPR 2022",
          "arxiv": "2112.10752"
        }
      ],
      "resources": [
        {
          "label": "Lilian Weng — What are Diffusion Models?",
          "url": "https://lilianweng.github.io/posts/2021-07-11-diffusion-models/"
        },
        {
          "label": "Stable Diffusion on HuggingFace",
          "url": "https://huggingface.co/docs/diffusers"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "dora": {
      "id": "dora",
      "name": "DoRA",
      "expansion": "Weight-Decomposed Low-Rank Adaptation",
      "category": "fine-tuning-methods",
      "oneliner": "Decompose weights into magnitude + direction, apply LoRA to direction only. More like full FT behavior, negligible extra params.",
      "explanation": "Weight-Decomposed Low-Rank Adaptation splits each weight matrix into a magnitude component and a direction component, then applies LoRA only to the direction while training magnitude separately. This is motivated by the finding that full fine-tuning primarily changes weight direction while standard LoRA couples direction and magnitude changes suboptimally. DoRA consistently outperforms LoRA with minimal additional parameters and is available in PEFT.",
      "fundamentals": "W' = $m \\cdot (V + (\\alpha/r) \\cdot BA) / \\|V + (\\alpha/r) \\cdot BA\\|_c$. Extra params: d_out per layer. LLaMA 7B: 0.004% overhead. Based on weight normalization.",
      "seen_in": [
        "model-config"
      ],
      "related": [
        "full-ft",
        "lora",
        "peft"
      ],
      "foundational_papers": [
        {
          "title": "DoRA: Weight-Decomposed Low-Rank Adaptation",
          "authors": "Liu et al.",
          "venue": "ICML 2024",
          "arxiv": "2402.09353"
        },
        {
          "title": "Weight Normalization: A Simple Reparameterization to Accelerate Training",
          "authors": "Salimans & Kingma",
          "venue": "NeurIPS 2016",
          "arxiv": "1602.07868"
        }
      ],
      "sources": [
        "Liu et al., arXiv:2402.09353"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10"
    },
    "dpo": {
      "id": "dpo",
      "name": "DPO",
      "expansion": "Direct Preference Optimization",
      "category": "training-pipeline",
      "oneliner": "Reparameterizes RLHF into a classification loss on preference pairs — no reward model, no PPO, 2 models in memory. The default open-source alignment method.",
      "explanation": "Direct Preference Optimization turns reinforcement learning from human feedback into a simple supervised loss that trains directly on preferred and rejected response pairs, eliminating the separate reward model and PPO training. This makes alignment two to five times cheaper while producing comparable results. DPO has become the default alignment method for open-source models including Zephyr, Tulu, and Llama 3.1.",
      "fundamentals": "Loss: $\\mathcal{L} = -\\mathbb{E}\\left[\\log \\sigma\\left(\\beta \\log \\frac{\\pi_\\theta(y_w|x)}{\\pi_{\\text{ref}}(y_w|x)} - \\beta \\log \\frac{\\pi_\\theta(y_l|x)}{\\pi_{\\text{ref}}(y_l|x)}\\right)\\right]$. $\\beta$ typically 0.1-0.5. lr 1e-6 to 5e-7. Same preference data as RLHF RM.",
      "seen_in": [
        "model-name",
        "model-cards"
      ],
      "related": [
        "rlhf",
        "sft",
        "simpo",
        "orpo",
        "kto"
      ],
      "foundational_papers": [
        {
          "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
          "authors": "Rafailov et al.",
          "venue": "NeurIPS 2023",
          "arxiv": "2305.18290"
        },
        {
          "title": "Zephyr: Direct Distillation of LM Alignment",
          "authors": "Tunstall et al.",
          "venue": "2023",
          "arxiv": "2310.16944"
        },
        {
          "title": "A General Theoretical Paradigm to Understand Learning from Human Feedback (IPO)",
          "authors": "Azar et al.",
          "venue": "AISTATS 2024",
          "arxiv": "2310.12036"
        }
      ],
      "sources": [
        "Rafailov et al., arXiv:2305.18290"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Cameron Wolfe — Direct Preference Optimization",
          "url": "https://cameronrwolfe.substack.com/p/direct-preference-optimization"
        },
        {
          "label": "HuggingFace TRL docs",
          "url": "https://huggingface.co/docs/trl/en/index"
        }
      ]
    },
    "dspy": {
      "id": "dspy",
      "name": "DSPy",
      "expansion": "DSPy — Declarative Self-improving Language Programs",
      "category": "agents-and-tools",
      "oneliner": "A framework from Stanford that replaces hand-written prompts with modular, optimizable programs where the framework automatically tunes prompts and few-shot examples.",
      "explanation": "DSPy is a programming framework that treats LLM interactions as optimizable modules rather than fixed prompt templates. You define what each step should accomplish using Python signatures, and DSPy automatically searches for the best prompts, few-shot examples, and even fine-tuning strategies to maximize a metric you specify. This eliminates brittle prompt engineering by making prompts a learnable parameter.",
      "fundamentals": "Core abstractions: Signature (input/output spec), Module (a composable step like ChainOfThought or ReAct), Teleprompter/Optimizer (searches for optimal prompts). Optimization: given a training set and metric, the optimizer evaluates candidate prompts via few-shot bootstrap, random search, or Bayesian optimization. Output: a compiled program with frozen, optimized prompts. Philosophy: prompts are like weights — they should be learned, not hand-written.",
      "related": [
        "agentic-ai",
        "chain-of-thought",
        "rag",
        "function-calling"
      ],
      "seen_in": [
        "code",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines",
          "authors": "Khattab et al.",
          "venue": "ICLR 2024",
          "arxiv": "2310.03714"
        }
      ],
      "resources": [
        {
          "label": "DSPy GitHub",
          "url": "https://github.com/stanfordnlp/dspy"
        },
        {
          "label": "DSPy docs",
          "url": "https://dspy.ai"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "e2m1": {
      "id": "e2m1",
      "name": "E2M1 (MXFP4)",
      "expansion": "2-bit Exponent, 1-bit Mantissa — Microscaling FP4",
      "category": "precision-formats",
      "oneliner": "4-bit micro-float: only 15 distinct values, made viable by sharing an 8-bit block scale across 32 elements. Next-gen format for Blackwell GPUs.",
      "explanation": "E2M1 is a tiny 4-bit floating-point format with 2 exponent bits and 1 mantissa bit, capable of representing only 15 distinct values. It becomes practical through the Microscaling specification, which groups 32 elements under a single shared 8-bit scale factor. The scale handles overall magnitude while individual elements capture relative differences. This format delivers double the compute density of 8-bit and is supported natively on NVIDIA Blackwell GPUs.",
      "fundamentals": "E2M1: 1 sign | 2 exp (bias 1) | 1 mantissa. No inf/NaN. Positive values: 0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0. With sign: 15 distinct values. E8M0 block scale: pure power of 2 ($2^{\\text{exp}-127}$), range $2^{-127}$ to $2^{127}$. Effective value = E2M1_value $\\times$ $2^{\\text{shared\\_exp} - 127}$. Effective bpw: 4 + 8/32 = 4.25 bits.",
      "seen_in": [
        "gguf-filename",
        "hardware-spec"
      ],
      "related": [
        "fp8",
        "mxfp4-moe",
        "quantization"
      ],
      "sources": [
        "OCP Microscaling (MX) Spec v1.0, September 2023",
        "Rouhani et al., arXiv:2310.10537"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "OCP Microscaling Formats (MX) Specification v1.0",
          "authors": "OCP (AMD, ARM, Intel, Meta, Microsoft, NVIDIA, Qualcomm)",
          "venue": "September 2023",
          "arxiv": null
        },
        {
          "title": "Microscaling Data Formats for Deep Learning",
          "authors": "Rouhani et al.",
          "venue": "2023",
          "arxiv": "2310.10537"
        }
      ]
    },
    "eetq": {
      "id": "eetq",
      "name": "EETQ",
      "expansion": "Easy and Efficient Quantization for Transformers",
      "category": "quantization-methods",
      "oneliner": "Simple 8-bit weight-only quantization — drop-in replacement with minimal setup. No calibration needed.",
      "explanation": "EETQ is a lightweight quantization library that applies per-channel 8-bit integer compression to model weights with no calibration data required. It uses straightforward absmax scaling on each output channel, keeping the implementation simple and fast. At int8 precision the quality differences between methods are small, so the simplicity is a practical advantage. It plugs directly into HuggingFace Transformers for easy deployment.",
      "fundamentals": "Per-channel symmetric quantization: scale_j = max(|W[j,:]|) / 127. W_int8[j,:] = round(W[j,:] / scale_j). At inference: dequantize to fp16 and compute. That's it.",
      "seen_in": [
        "quantization-config"
      ],
      "related": [
        "calibration-data",
        "fp16",
        "inference",
        "int8",
        "quantization"
      ],
      "sources": [
        "github.com/NetEase-FuXi/EETQ"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10"
    },
    "embeddings": {
      "id": "embeddings",
      "name": "Embeddings",
      "expansion": "Text Embeddings (Dense Vector Representations)",
      "category": "embeddings-retrieval",
      "oneliner": "Dense vector representations of text that capture semantic meaning, enabling similarity search, clustering, and retrieval-augmented generation.",
      "explanation": "Embeddings are fixed-length vectors that encode the semantic meaning of text in a high-dimensional space. Similar texts produce similar vectors, enabling semantic search — finding relevant documents by meaning rather than keyword overlap. Embedding models (like BGE, GTE, Nomic Embed) are distinct from generative models: they output a single vector per input rather than generating text.",
      "related": [
        "rag",
        "tag-embed",
        "tag-mrl",
        "token"
      ],
      "seen_in": [
        "code",
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "MTEB Leaderboard",
          "url": "https://huggingface.co/spaces/mteb/leaderboard"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "evals": {
      "id": "evals",
      "name": "Evals",
      "expansion": "LLM Evaluations (Benchmarks and Assessment)",
      "category": "scaling-patterns",
      "oneliner": "The practice of systematically measuring LLM capabilities using benchmarks, human judgment, and automated scoring to compare models and detect regressions.",
      "explanation": "Evals are systematic tests that measure what a language model can and cannot do. They range from standardized benchmarks like MMLU and HumanEval to custom task-specific tests and human preference ratings. Good evals are the foundation of responsible model development — they detect capability regressions, compare training recipes, and surface safety issues.",
      "fundamentals": "Major eval categories: knowledge (MMLU), coding (HumanEval, MBPP), reasoning (GPQA, ARC), math (GSM8K, MATH), safety (TruthfulQA, BBQ), instruction-following (IFEval, AlpacaEval, MT-Bench). Metrics: accuracy, pass@k, Elo rating (Chatbot Arena), perplexity. Contamination: overlap between eval data and training data inflates scores. Saturation: when frontier models hit 90%+, the benchmark loses discriminating power.",
      "related": [
        "mmlu",
        "humaneval",
        "perplexity",
        "pre-training"
      ],
      "seen_in": [
        "model-cards",
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "Open LLM Leaderboard",
          "url": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard"
        },
        {
          "label": "Chatbot Arena",
          "url": "https://lmarena.ai"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "exl2": {
      "id": "exl2",
      "name": "EXL2",
      "expansion": "ExLlamaV2 Quantization Format",
      "category": "quantization-methods",
      "oneliner": "Variable-bitrate format mixing 2-8 bit precision across layers — target any average bpw (e.g., 3.5, 4.65). Best quality-per-bit on consumer GPUs.",
      "explanation": "EXL2 is a mixed-precision quantization format native to the ExLlamaV2 inference engine. It allocates more bits to sensitive layers and fewer bits to robust ones, rather than using the same bitwidth everywhere. A measurement pass records error at multiple precisions per layer, then an optimization step distributes bits to minimize overall quality loss under a target file size.",
      "fundamentals": "Measurement: quantize each layer at 2,3,4,5,6,8 bits, record $\\|WX - Q_b(W)X\\|^2$. Solving: min max_i error_i(b_i) s.t. $\\Sigma$ size_i(b_i) = target. Column-level mixing via Hessian diagonal (proxy for column importance). Validation: post-quant perplexity < 30 = success.",
      "seen_in": [
        "model-name",
        "repo-name"
      ],
      "related": [
        "gptq",
        "inference",
        "inference-engine",
        "quantization",
        "tool-exllamav2"
      ],
      "sources": [
        "github.com/turboderp-org/exllamav2 (no paper; documented in repo wiki)"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10"
    },
    "expert-parallelism": {
      "id": "expert-parallelism",
      "name": "Expert Parallelism",
      "expansion": "Expert Parallelism (EP)",
      "category": "scaling-patterns",
      "oneliner": "Distribute MoE experts across GPUs so each device holds a subset of experts, enabling models with hundreds of experts to scale beyond single-node memory.",
      "explanation": "Expert parallelism is a distribution strategy for Mixture-of-Experts models where different experts in each MoE layer are placed on different GPUs. When a token is routed to a particular expert, it is sent via all-to-all communication to the GPU hosting that expert, processed, and returned. This complements tensor parallelism and pipeline parallelism as a third axis of model distribution. DeepSeek-V3 relies heavily on expert parallelism to serve 256 routed experts.",
      "fundamentals": "Given $E$ experts and $G$ GPUs, each GPU holds $E/G$ experts. Forward pass: 1) router selects top-$k$ experts per token; 2) all-to-all dispatch sends tokens to correct GPUs; 3) each GPU runs its local experts on received tokens; 4) all-to-all combine returns results. Communication volume: $O(B \\times k \\times d / G)$ per layer, where $B$ = tokens in batch, $d$ = hidden dim. Can combine with TP (split each expert) and PP (split layers). Load balancing is critical: skewed routing wastes GPU cycles on idle experts.",
      "related": [
        "tp-pp",
        "moe",
        "expert-routing"
      ],
      "seen_in": [
        "code",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding",
          "authors": "Lepikhin et al.",
          "venue": "ICLR 2021",
          "arxiv": "2006.16668"
        },
        {
          "title": "MegaScale-Infer: Serving Mixture-of-Experts at Scale with Disaggregated Expert Parallelism",
          "authors": "Jiang et al.",
          "venue": "arXiv 2025",
          "arxiv": "2504.02263"
        }
      ],
      "resources": [
        {
          "label": "LMSYS: Large-scale EP for DeepSeek",
          "url": "https://www.lmsys.org/blog/2025-05-05-large-scale-ep/"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "expert-routing": {
      "id": "expert-routing",
      "name": "Expert Routing / Gating",
      "expansion": "Expert Routing (Gating Network)",
      "category": "scaling-patterns",
      "oneliner": "The learned mechanism assigning tokens to experts — linear projection → softmax → top-k selection. Load balancing is the central challenge.",
      "explanation": "Expert routing is the learned mechanism inside a Mixture-of-Experts layer that decides which expert sub-networks process each token. A small gating network scores all experts for each token, then selects the top-k highest-scoring ones. The main challenge is load balancing: without regularization, the router collapses to using only a few popular experts while the rest go unused. Solutions include auxiliary balance losses and the shared-expert design used in DeepSeek.",
      "fundamentals": "Top-k: $\\text{scores} = W_g \\cdot x$, $\\text{probs} = \\text{softmax}(\\text{scores})$, select top-k, output = $\\sum w_i \\cdot \\text{Expert}_i(x)$. Noisy gating: scores += $\\text{softplus}(W_{\\text{noise}} \\cdot x) \\cdot \\varepsilon$. Load balance loss: $L = \\alpha \\cdot N \\cdot \\sum_i (f_i \\cdot p_i)$. Expert choice (Zhou 2022): each expert picks top-c tokens from batch — perfect balance by construction. DeepSeek shared+routed: $y = \\text{Expert}_{\\text{shared}}(x) + \\sum_{i \\in \\text{TopK}} g_i \\cdot \\text{Expert}_{\\text{routed},i}(x)$. DeepSeek-V3 bias-based balance: $\\text{routing\\_score}_i = (W_g \\cdot x)_i + b_i$, biases updated from running load stats.",
      "seen_in": [
        "model-config",
        "code"
      ],
      "related": [
        "moe",
        "token"
      ],
      "sources": [
        "Shazeer et al., 'Outrageously Large Neural Networks,' ICLR 2017",
        "Zhou et al., 'Mixture-of-Experts with Expert Choice Routing,' NeurIPS 2022",
        "DeepSeek-AI, 'DeepSeek-V2,' arXiv:2405.04434"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Mixture-of-Experts with Expert Choice Routing",
          "authors": "Zhou et al.",
          "venue": "NeurIPS 2022",
          "arxiv": "2202.09368"
        }
      ],
      "resources": [
        {
          "label": "Cameron Wolfe — MoE LLMs",
          "url": "https://cameronrwolfe.substack.com/p/moe-llms"
        }
      ]
    },
    "faiss": {
      "id": "faiss",
      "name": "FAISS",
      "expansion": "FAISS — Facebook AI Similarity Search",
      "category": "embeddings-retrieval",
      "oneliner": "Meta's open-source library for efficient similarity search and clustering of dense vectors — the most widely used engine for nearest-neighbor retrieval in RAG pipelines.",
      "explanation": "FAISS is a library from Meta for searching through large collections of dense vectors using approximate nearest neighbor algorithms. It powers the retrieval step in most RAG systems, finding the document chunks most semantically similar to a user's query. FAISS supports multiple index types trading accuracy for speed: flat (exact but slow), IVF (inverted file for faster approximate search), and HNSW (graph-based for high recall). It runs on CPU and GPU, handling billions of vectors.",
      "related": [
        "vector-database",
        "embeddings",
        "rag"
      ],
      "seen_in": [
        "code",
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "FAISS GitHub",
          "url": "https://github.com/facebookresearch/faiss"
        },
        {
          "label": "FAISS tutorial",
          "url": "https://github.com/facebookresearch/faiss/wiki"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "few-shot": {
      "id": "few-shot",
      "name": "Few-Shot Prompting",
      "expansion": "Few-Shot Prompting (In-Context Learning)",
      "category": "prompting",
      "oneliner": "Providing a handful of input-output examples in the prompt so the model learns the desired task format and behavior without any weight updates.",
      "explanation": "Few-shot prompting provides 2-10 labeled examples in the prompt to demonstrate the desired task before asking the model to perform it on a new input. The model adapts to the task purely from the examples in its context window, without any training or weight changes. This in-context learning ability emerged with scale — it works reliably in models above a few billion parameters. The quality and diversity of examples matters significantly.",
      "related": [
        "prompt-engineering",
        "zero-shot",
        "chain-of-thought",
        "transfer-learning"
      ],
      "seen_in": [
        "documentation",
        "code"
      ],
      "foundational_papers": [
        {
          "title": "Language Models are Few-Shot Learners",
          "authors": "Brown et al.",
          "venue": "NeurIPS 2020",
          "arxiv": "2005.14165"
        }
      ],
      "resources": [
        {
          "label": "Lilian Weng — Prompt Engineering",
          "url": "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "ffn": {
      "id": "ffn",
      "name": "FFN / MLP",
      "expansion": "Feed-Forward Network / Multi-Layer Perceptron",
      "category": "layer-types",
      "oneliner": "The position-wise fully connected sub-block in each transformer layer — up-project, activate, down-project. Stores ~67% of each layer's parameters.",
      "explanation": "The feed-forward network is the component of each transformer layer that processes tokens independently with no cross-token interaction. It projects each token to a larger intermediate dimension, applies an activation function, and projects back down. The FFN holds roughly two-thirds of each layer's total parameters and serves as the model's primary knowledge store. Modern LLMs use SwiGLU activation, which adds a third projection matrix.",
      "fundamentals": "Standard: FFN(x) = $W_2$·activation($W_1$·x). Gated (SwiGLU): FFN(x) = (Swish(xW_gate) ⊙ xW_up)W_down. LLaMA 7B: d_model=4096, d_ff=11008. FFN params: 3$\\times$4096$\\times$11008 = 135M vs Attention: 4$\\times$4096$^2$ = 67M. The '2/3 of 4x' rule: d_ff $\\approx$ (8/3)$\\times$d_model, rounded to multiple of 256 for hardware efficiency.",
      "seen_in": [
        "model-config",
        "code"
      ],
      "related": [
        "geglu",
        "moe",
        "residual-connection",
        "swiglu",
        "token"
      ],
      "sources": [
        "Vaswani et al., 'Attention Is All You Need,' NeurIPS 2017, arXiv:1706.03762"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Attention Is All You Need (original Transformer)",
          "url": "https://arxiv.org/abs/1706.03762"
        },
        {
          "label": "Sebastian Raschka — LLM Architecture Gallery",
          "url": "https://sebastianraschka.com/llm-architecture-gallery/"
        }
      ]
    },
    "fine-tuning": {
      "id": "fine-tuning",
      "name": "Fine-Tuning",
      "expansion": "Fine-Tuning (Model Adaptation)",
      "category": "fine-tuning-methods",
      "oneliner": "The general practice of further training a pre-trained model on task-specific data — encompasses full fine-tuning, LoRA, QLoRA, and all other adaptation methods.",
      "explanation": "Fine-tuning is the umbrella term for adapting a pre-trained language model to a specific task or domain by continuing training on specialized data. It ranges from full fine-tuning (updating every parameter) to parameter-efficient methods like LoRA (updating less than 1 percent of weights). Fine-tuning is what transforms a generic base model into a code assistant, medical advisor, or customer service bot.",
      "related": [
        "lora",
        "qlora",
        "full-ft",
        "sft",
        "adapters",
        "peft",
        "transfer-learning"
      ],
      "seen_in": [
        "documentation",
        "model-cards"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "HuggingFace — Fine-tune a pretrained model",
          "url": "https://huggingface.co/docs/transformers/training"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "flash-attention": {
      "id": "flash-attention",
      "name": "Flash Attention (FA / FA2 / FA3)",
      "expansion": "Flash Attention — IO-aware tiling algorithm for exact attention",
      "category": "attention-variants",
      "oneliner": "Computes exact attention without materializing the N$\\times$N matrix in GPU HBM — uses tiling and online softmax to reduce memory from $O($N^2$)$ to $O(N)$, achieving 2-4$\\times$ speedups.",
      "explanation": "Flash Attention is an algorithm that computes exact standard attention without materializing the full attention matrix in GPU main memory. It tiles the query, key, and value matrices into small blocks that fit in fast on-chip SRAM, accumulating results with an online softmax trick. The output is mathematically identical to standard attention but uses far less memory and runs significantly faster. It is used by default in most modern inference and training frameworks.",
      "fundamentals": "Standard IO: Θ(N·d + $N^2$) HBM accesses. Flash Attention IO: Θ($N^2$·$d^2$/M) where M=SRAM size. For typical M>>$d^2$, significant reduction. Tiling: for each Q block, iterate K/V blocks. Maintain running max m and sum l for online softmax. Rescale output O as new blocks processed. Memory: $O(N)$ — only Q,K,V,O in HBM, no N$\\times$N matrix. Backward pass: recomputes attention from Q,K,V (gradient checkpointing). Versions: FA1 ~120 TFLOPS A100, FA2 ~230 TFLOPS A100, FA3 ~740 TFLOPS H100.",
      "seen_in": [
        "code",
        "model-config",
        "serving-config"
      ],
      "related": [
        "inference",
        "kv-cache",
        "paged-attention",
        "sdpa"
      ],
      "sources": [
        "Dao et al., 'FlashAttention,' NeurIPS 2022, arXiv:2205.14135",
        "Dao, 'FlashAttention-2,' 2023, arXiv:2307.08691",
        "Shah & Dao et al., 'FlashAttention-3,' 2024, arXiv:2407.08691"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness",
          "authors": "Dao et al.",
          "venue": "NeurIPS 2022",
          "arxiv": "2205.14135"
        },
        {
          "title": "FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning",
          "authors": "Dao",
          "venue": "2023",
          "arxiv": "2307.08691"
        }
      ],
      "resources": [
        {
          "label": "Lilian Weng — LLM Inference Optimization",
          "url": "https://lilianweng.github.io/posts/2023-01-10-inference-optimization/"
        }
      ]
    },
    "format-coreml": {
      "id": "format-coreml",
      "name": "CoreML",
      "expansion": "Apple's ML format for on-device inference",
      "category": "formats",
      "oneliner": ".mlmodel/.mlpackage files. Runs on Apple Neural Engine, GPU, CPU transparently. For iOS/macOS apps. Converter: coremltools.",
      "seen_in": [
        "filename"
      ],
      "related": [
        "inference",
        "org-mlx-community"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Apple Core ML docs",
          "url": "https://developer.apple.com/documentation/coreml"
        },
        {
          "label": "coremltools",
          "url": "https://github.com/apple/coremltools"
        }
      ],
      "explanation": "Core ML is Apple's on-device machine learning framework, using .mlmodel and .mlpackage file formats. It automatically dispatches inference across the Apple Neural Engine, GPU, and CPU for optimal performance on iPhones, iPads, and Macs. Developers integrate models directly into iOS and macOS apps with a few lines of Swift, and Apple's tools handle hardware-specific optimization transparently."
    },
    "format-ct2": {
      "id": "format-ct2",
      "name": "CTranslate2 (ct2)",
      "expansion": "Efficient inference engine/format by OpenNMT",
      "category": "formats",
      "oneliner": "Fast CPU inference, INT8/INT16/FP16. Originally built for translation (encoder-decoder). Less common for large decoder-only LLMs. Reduced activity since late 2024.",
      "seen_in": [
        "filename"
      ],
      "related": [
        "fp16",
        "inference",
        "inference-engine",
        "int8",
        "quantization",
        "tool-llamacpp",
        "tool-vllm"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "CTranslate2 GitHub",
          "url": "https://github.com/OpenNMT/CTranslate2"
        }
      ],
      "explanation": "CTranslate2 is a lightweight inference engine optimized for fast CPU execution with INT8, INT16, and FP16 quantization. Originally developed by OpenNMT for translation models like MarianMT and NLLB, it also supports encoder-decoder architectures such as Whisper. However, it has seen reduced development activity since late 2024 and is less commonly used for large decoder-only LLMs compared to llama.cpp or vLLM."
    },
    "format-onnx": {
      "id": "format-onnx",
      "name": "ONNX",
      "expansion": "Open Neural Network Exchange",
      "category": "formats",
      "oneliner": "Open format for ML computation graphs. Train in PyTorch, export to ONNX, run anywhere via ONNX Runtime (CPU, CUDA, DirectML, TensorRT). Key for edge/Windows.",
      "seen_in": [
        "filename",
        "model-name"
      ],
      "related": [
        "inference",
        "quantization"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "ONNX GitHub",
          "url": "https://github.com/onnx/onnx"
        },
        {
          "label": "ONNX Runtime",
          "url": "https://github.com/microsoft/onnxruntime"
        }
      ],
      "explanation": "ONNX (Open Neural Network Exchange) is an open standard for representing machine learning computation graphs. You train a model in any framework like PyTorch or TensorFlow, export it to ONNX format, and run inference anywhere using ONNX Runtime. It supports operator-level optimizations and quantization, making it a practical choice for deploying models across CPUs, GPUs, and edge devices without framework lock-in."
    },
    "format-openvino": {
      "id": "format-openvino",
      "name": "OpenVINO",
      "expansion": "Intel's inference optimization toolkit",
      "category": "formats",
      "oneliner": "Converts to OpenVINO IR format, applies INT8/INT4 quant via NNCF, hardware-specific kernels. Significantly faster than generic ONNX on Intel CPUs (AMX).",
      "seen_in": [
        "filename"
      ],
      "related": [
        "format-onnx",
        "inference",
        "int4",
        "int8",
        "quantization"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "OpenVINO GitHub",
          "url": "https://github.com/openvinotoolkit/openvino"
        }
      ],
      "explanation": "OpenVINO is Intel's toolkit for optimizing and deploying inference workloads on Intel hardware. It converts models from PyTorch or ONNX into its own Intermediate Representation format, then applies INT8 and INT4 quantization along with graph-level optimizations. The result is significantly faster inference on Intel CPUs and integrated GPUs, making it the go-to choice for deploying AI on Intel-based servers and edge devices."
    },
    "fp16": {
      "id": "fp16",
      "name": "fp16 / float16",
      "expansion": "16-bit IEEE 754 Half-Precision Floating Point",
      "category": "precision-formats",
      "oneliner": "A 16-bit float that halves memory vs fp32 but has limited dynamic range — fast for inference, needs loss scaling for training.",
      "explanation": "FP16 is a 16-bit floating-point format that halves memory compared to 32-bit, so a 7-billion parameter model fits in about 14 GB. It enables tensor core acceleration on NVIDIA GPUs for significant throughput gains. The main limitation is a small maximum value of 65,504, meaning training gradients can overflow and require loss scaling. For inference this is rarely a problem. FP16 has largely been replaced by BFloat16 for training.",
      "fundamentals": "Bit layout: 1 sign | 5 exponent (bias 15) | 10 mantissa. Value = (-1)^sign $\\times$ 2^(exp-15) $\\times$ 1.mantissa. Range: $\\pm$65,504. Precision: ~3.3 decimal digits. Compared to fp32: range drops from ~$10^{38}$ to ~6.5$\\times 10^{4}$ (34 orders of magnitude less), precision from ~7.2 to ~3.3 digits.",
      "seen_in": [
        "model-name",
        "filename"
      ],
      "related": [
        "bf16",
        "fp32",
        "inference"
      ],
      "sources": [
        "IEEE 754-2008 Section 3.6",
        "Micikevicius et al., 'Mixed Precision Training,' ICLR 2018, arXiv:1710.03740"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Mixed Precision Training",
          "authors": "Micikevicius et al.",
          "venue": "ICLR 2018",
          "arxiv": "1710.03740"
        }
      ]
    },
    "fp32": {
      "id": "fp32",
      "name": "fp32 / float32",
      "expansion": "32-bit IEEE 754 Single-Precision Floating Point",
      "category": "precision-formats",
      "oneliner": "The standard 32-bit floating-point format — baseline 'full precision' for neural networks.",
      "explanation": "FP32 is the standard 32-bit floating-point format where each parameter occupies 4 bytes, so a 7-billion parameter model requires about 28 GB. It provides excellent precision and enormous numerical range but is now considered overkill for model weights. It is still used for optimizer states and master weight copies in mixed-precision training. When people say a model is quantized, they mean converted from FP32 or BFloat16 to a smaller format.",
      "fundamentals": "Bit layout: 1 sign | 8 exponent (bias 127) | 23 mantissa. Value = (-1)^sign $\\times$ 2^(exp-127) $\\times$ 1.mantissa. The 23-bit mantissa + implicit leading 1 = 24 bits of significand $\\approx$ 7.2 decimal digits. Special values: $\\pm$inf (exp all 1s, mantissa 0), NaN (exp all 1s, mantissa ≠0), subnormals (exp all 0s).",
      "seen_in": [
        "model-weights",
        "optimizer-states"
      ],
      "related": [
        "fp16",
        "bf16",
        "quantization"
      ],
      "sources": [
        "IEEE 754-2019"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "IEEE 754-2019 Standard",
          "url": "https://ieeexplore.ieee.org/document/8766229"
        }
      ]
    },
    "fp8": {
      "id": "fp8",
      "name": "fp8 (E4M3 / E5M2)",
      "expansion": "8-bit Floating Point — two variants for forward and backward passes",
      "category": "precision-formats",
      "oneliner": "Two 8-bit float formats: E4M3 for weights/activations (more precision) and E5M2 for gradients (more range). 2$\\times$ throughput vs fp16.",
      "explanation": "FP8 is an 8-bit floating-point format available in two variants optimized for different parts of the training loop. The E4M3 variant provides more precision for forward-pass weights and activations, while E5M2 provides more dynamic range for backward-pass gradients. FP8 runs natively on NVIDIA H100 tensor cores at double the throughput of FP16, with accuracy loss typically under 0.1 percent.",
      "fundamentals": "E4M3: 1 sign | 4 exp (bias 7) | 3 mantissa. No infinity — all exp=1111 patterns are finite (deliberate departure from IEEE 754). Only 16 values per sign per binade. E5M2: 1 sign | 5 exp (bias 15) | 2 mantissa. Follows IEEE 754 conventions (has inf/NaN). Per-tensor scaling factors are essential — without them, clipping destroys quality.",
      "seen_in": [
        "model-name",
        "serving-config"
      ],
      "related": [
        "bf16",
        "fp16",
        "w8a8"
      ],
      "sources": [
        "Micikevicius et al., 'FP8 Formats for Deep Learning,' arXiv:2209.05433"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "FP8 Formats for Deep Learning",
          "authors": "Micikevicius et al.",
          "venue": "2022",
          "arxiv": "2209.05433"
        }
      ]
    },
    "full-ft": {
      "id": "full-ft",
      "name": "Full Fine-Tuning",
      "expansion": "Full Fine-Tuning (all parameters updated)",
      "category": "fine-tuning-methods",
      "oneliner": "Update every parameter — ~12N bytes memory, maximum expressiveness, risks catastrophic forgetting, produces full model copies.",
      "explanation": "Full fine-tuning is the process of updating every parameter in a model during training, giving maximum flexibility to learn new tasks or domains. It requires roughly twelve times the model size in memory to hold weights, gradients, optimizer states, and master copies. A 7B parameter model needs around 84 GB, so distributed training with DeepSpeed ZeRO or FSDP is essential. All major instruct and chat models from leading labs use full fine-tuning.",
      "fundamentals": "Memory: 7B→84GB, 13B→156GB, 70B→840GB. ZeRO-1: shard optimizer. ZeRO-2: +gradients. ZeRO-3: +parameters. lr 1e-5 to 5e-5, 1-3 epochs.",
      "seen_in": [
        "model-name"
      ],
      "related": [
        "lora",
        "sft",
        "zero"
      ],
      "foundational_papers": [
        {
          "title": "Training language models to follow instructions with human feedback (InstructGPT)",
          "authors": "Ouyang et al.",
          "venue": "NeurIPS 2022",
          "arxiv": "2203.02155"
        },
        {
          "title": "ZeRO: Memory Optimizations Toward Training Trillion Parameter Models",
          "authors": "Rajbhandari et al.",
          "venue": "SC 2020",
          "arxiv": "1910.02054"
        }
      ],
      "sources": [
        "Ouyang et al., arXiv:2203.02155"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10"
    },
    "function-calling": {
      "id": "function-calling",
      "name": "Function Calling",
      "expansion": "LLM Function Calling / Tool Use",
      "category": "agents-and-tools",
      "oneliner": "The mechanism by which an LLM emits structured JSON to invoke external tools instead of generating free-text. Foundation of all agentic behaviour.",
      "explanation": "Function calling is the capability that lets a language model request execution of external functions by outputting structured arguments rather than plain text. The developer registers a set of tool schemas (name, description, parameter JSON Schema) in the prompt or API call. When the model decides a tool is needed, it emits a tool-call object with the function name and arguments. The harness executes the function, returns the result, and the model continues reasoning.",
      "fundamentals": "Flow: user prompt + tool schemas -> model reasons -> model emits tool_call(name, args) -> harness validates and executes -> result appended as tool_result -> model generates final answer or chains another call. Parallel tool calling lets the model emit multiple calls in one turn. Forced tool use constrains the model to always call a specific function. Structured output mode (JSON Schema) is closely related but produces data rather than invoking actions. Fine-tuning on function-calling datasets improves reliability; constrained decoding guarantees valid JSON.",
      "related": [
        "agentic-ai",
        "mcp",
        "inference-engine"
      ],
      "seen_in": [
        "documentation",
        "code"
      ],
      "foundational_papers": [
        {
          "title": "Toolformer: Language Models Can Teach Themselves to Use Tools",
          "authors": "Schick et al.",
          "venue": "NeurIPS 2023",
          "arxiv": "2302.04761"
        },
        {
          "title": "Gorilla: Large Language Model Connected with Massive APIs",
          "authors": "Patil et al.",
          "venue": "arXiv 2023",
          "arxiv": "2305.15334"
        }
      ],
      "resources": [
        {
          "label": "OpenAI function calling guide",
          "url": "https://platform.openai.com/docs/guides/function-calling"
        },
        {
          "label": "Anthropic tool use docs",
          "url": "https://docs.anthropic.com/en/docs/build-with-claude/tool-use"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "geglu": {
      "id": "geglu",
      "name": "GeGLU",
      "expansion": "GELU-activated Gated Linear Unit",
      "category": "layer-types",
      "oneliner": "GLU variant using GELU activation instead of Swish — performs nearly identically to SwiGLU, used in T5 v1.1 and Flan-T5.",
      "explanation": "GeGLU is a gated activation function that uses the GELU nonlinearity as its gating mechanism inside the feed-forward network. It was introduced as an alternative to ReLU and performs nearly identically to SwiGLU in quality benchmarks. The T5 and Flan-T5 model families adopted GeGLU, while the Llama ecosystem standardized on SwiGLU instead. It has become less common in decoder-only LLMs since 2023.",
      "fundamentals": "GeGLU(x) = (GELU(xW_gate) ⊙ xW_up)W_down. GELU(z) $\\approx$ z·0.5·(1 + tanh($\\sqrt{2/π}$·(z + 0.044715$z^3$))). Same parameter count as SwiGLU: 3 $\\times$ d_model $\\times$ d_ff. Swish vs GELU: $\\Phi$(x) has slightly steeper transition than $\\sigma$(x), negligible practical difference.",
      "seen_in": [
        "model-config",
        "code"
      ],
      "related": [
        "ffn",
        "swiglu"
      ],
      "sources": [
        "Shazeer, 'GLU Variants Improve Transformer,' 2020, arXiv:2002.05202",
        "Hendrycks & Gimpel, 'GELUs,' 2016, arXiv:1606.08415"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "GLU Variants Improve Transformer",
          "authors": "Shazeer",
          "venue": "2020",
          "arxiv": "2002.05202"
        },
        {
          "title": "Gaussian Error Linear Units (GELUs)",
          "authors": "Hendrycks & Gimpel",
          "venue": "2016",
          "arxiv": "1606.08415"
        }
      ]
    },
    "ggml": {
      "id": "ggml",
      "name": "GGML",
      "expansion": "Georgi Gerganov Machine Learning (library and legacy format)",
      "category": "formats",
      "oneliner": "Legacy format — predecessor to GGUF. No longer used for new models but you may encounter old repos with .ggml files.",
      "explanation": "GGML is an obsolete model file format that was originally used by llama.cpp for local inference. It stored tensor data but lacked proper metadata support, with no fields for tokenizer configuration, model architecture details, or format versioning. The GGML C library still powers tensor math in llama.cpp, but the file format was replaced by GGUF in August 2023.",
      "fundamentals": "Simple binary layout: tensor data with minimal headers. No standardized metadata — required external configuration files. No versioning — breaking changes required new tooling. GGUF solved all of these by adding a proper header with KV metadata.",
      "seen_in": [
        "legacy-repos",
        "old-filenames"
      ],
      "related": [
        "gguf",
        "inference",
        "tool-llamacpp"
      ],
      "sources": [
        "github.com/ggerganov/ggml"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10"
    },
    "gguf": {
      "id": "gguf",
      "name": "GGUF",
      "expansion": "GGML Universal File",
      "category": "formats",
      "oneliner": "The standard single-file format for llama.cpp/Ollama/LM Studio — contains model weights, metadata, tokenizer, and quantization info in one file.",
      "explanation": "GGUF is a self-contained model file format designed for local inference with llama.cpp and compatible tools. Everything needed to load and run a model lives in a single file, including weights, tokenizer data, and architecture metadata stored as key-value pairs. It supports all common quantization types from Q2_K through Q8_0, plus IQ and UD variants. GGUF is the standard format for split CPU and GPU inference on consumer hardware.",
      "fundamentals": "File structure: 1) Magic number ('GGUF'). 2) Version. 3) Tensor count. 4) Metadata KV pairs (architecture, tokenizer, quant type, etc.). 5) Tensor descriptors (name, shape, type, offset). 6) Tensor data (aligned, mmap-friendly). Quantization type is per-tensor — different tensors can use different types. Supports memory mapping for efficient loading.",
      "seen_in": [
        "repo-name",
        "filename"
      ],
      "related": [
        "ggml",
        "inference",
        "iq-quants",
        "k-quants",
        "quantization",
        "tool-llamacpp",
        "tool-lmstudio",
        "tool-ollama"
      ],
      "sources": [
        "github.com/ggml-org/ggml/blob/master/docs/gguf.md",
        "gguf-py README"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10"
    },
    "gpqa": {
      "id": "gpqa",
      "name": "GPQA",
      "expansion": "GPQA — Graduate-Level Google-Proof Q&A",
      "category": "scaling-patterns",
      "oneliner": "A 448-question science reasoning benchmark where non-expert humans score only 34% — designed to measure frontier model capabilities that MMLU can no longer distinguish.",
      "explanation": "GPQA is a benchmark of 448 extremely difficult science questions in biology, physics, and chemistry, written by domain experts and verified to be unsearchable. Non-expert humans with internet access score only 34 percent, while expert humans score around 65 percent. GPQA has replaced MMLU as the discriminating benchmark for frontier models because MMLU is now saturated above 88 percent. Models are evaluated on the GPQA-Diamond subset of 198 hardest questions.",
      "related": [
        "mmlu",
        "evals",
        "reasoning-models"
      ],
      "seen_in": [
        "model-cards",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "GPQA: A Graduate-Level Google-Proof Q&A Benchmark",
          "authors": "Rein et al.",
          "venue": "2023",
          "arxiv": "2311.12022"
        }
      ],
      "resources": [
        {
          "label": "GPQA paper",
          "url": "https://arxiv.org/abs/2311.12022"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "gptq": {
      "id": "gptq",
      "name": "GPTQ",
      "expansion": "GPT-Quantization (named after target: Generative Pre-trained Transformers)",
      "category": "quantization-methods",
      "oneliner": "Post-training weight quantization using second-order (Hessian) information to compress weights to 3-4 bits with minimal accuracy loss.",
      "explanation": "GPTQ is a post-training quantization method that compresses model weights layer by layer using mathematical error correction. When one weight is rounded to a lower precision, GPTQ adjusts the remaining weights in that layer to compensate, guided by Hessian information about which errors matter most. Processing columns in blocks of 128 makes it scale to models with hundreds of billions of parameters in just a few GPU-hours.",
      "fundamentals": "Objective: $\\min \\|WX - Q(W)X\\|^2$. OBS update: when $w_q$ is quantized, distribute error via $\\delta w = -\\delta \\cdot (H^{-1}_{:,q} / H^{-1}_{q,q})$. GPTQ processes columns left-to-right with lazy batch updates per 128-column block. Complexity: $O(d_{\\text{row}} \\times d_{\\text{col}}^2)$.",
      "seen_in": [
        "model-name",
        "filename"
      ],
      "related": [
        "awq",
        "marlin",
        "ptq",
        "quantization"
      ],
      "sources": [
        "Frantar et al., 'GPTQ,' ICLR 2023, arXiv:2210.17323"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "GPTQ: Accurate Post-Training Quantization for Generative Pre-Trained Transformers",
          "authors": "Frantar et al.",
          "venue": "ICLR 2023",
          "arxiv": "2210.17323"
        }
      ],
      "resources": [
        {
          "label": "HuggingFace — Selecting a Quantization Method",
          "url": "https://huggingface.co/docs/transformers/quantization/selecting"
        }
      ]
    },
    "gpu-memory": {
      "id": "gpu-memory",
      "name": "GPU Memory",
      "expansion": "GPU Memory (VRAM) for LLMs",
      "category": "scaling-patterns",
      "oneliner": "The GPU video memory (VRAM) available for loading model weights, KV cache, and activations — the primary hardware constraint for LLM deployment.",
      "explanation": "GPU memory (VRAM) is the single most important hardware constraint for running LLMs. Model weights, the KV cache, and activations must all fit in VRAM. Consumer GPUs like the RTX 4090 have 24 GB, datacenter GPUs like A100 have 40-80 GB, and H100s have 80 GB of fast HBM3 memory. A 70B model in fp16 needs 140 GB — more than any single GPU — requiring either quantization (4-bit reduces it to 35 GB) or multi-GPU distribution.",
      "related": [
        "model-size-memory",
        "quantization",
        "int4",
        "tp-pp",
        "inference"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "gqa": {
      "id": "gqa",
      "name": "GQA",
      "expansion": "Grouped Query Attention",
      "category": "attention-variants",
      "oneliner": "Query heads grouped into G groups, each sharing one K/V head — the dominant middle-ground between MHA quality and MQA efficiency.",
      "explanation": "Grouped Query Attention is a middle ground between standard multi-head attention and multi-query attention that groups query heads into clusters, with each group sharing one key and one value head. This reduces KV cache size without the quality loss of using a single shared head. Existing multi-head models can be converted to GQA with just 5-10 percent of the original training compute. It is the dominant attention variant in modern LLMs including Llama 2 70B, Llama 3, and Mistral.",
      "fundamentals": "For Mistral 7B: hidden_dim=4096, n_query_heads=32, n_kv_heads=8, head_dim=128, G=4. W_Q [4096, 4096], W_K [4096, 1024], W_V [4096, 1024]. Query heads 0-3 attend to KV head 0, heads 4-7 to KV head 1, etc. Implementation: K/V repeated via repeat_interleave or grouped computation. KV cache per token per layer: 2 $\\times$ 8 $\\times$ 128 $\\times$ 2 = 4 KB (4$\\times$ reduction vs MHA, 8$\\times$ larger than MQA). General formula: cache = 2 $\\times$ n_kv_heads $\\times$ head_dim $\\times$ seq_len $\\times$ n_layers $\\times$ bytes_per_param.",
      "seen_in": [
        "model-config"
      ],
      "related": [
        "kv-cache",
        "mha",
        "mla",
        "mqa",
        "token"
      ],
      "sources": [
        "Ainslie et al., 'GQA: Training Generalized Multi-Query Transformer Models,' EMNLP 2023, arXiv:2305.13245"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints",
          "authors": "Ainslie et al.",
          "venue": "EMNLP 2023",
          "arxiv": "2305.13245"
        }
      ],
      "resources": [
        {
          "label": "Sebastian Raschka — Visual Attention Variants",
          "url": "https://magazine.sebastianraschka.com/p/visual-attention-variants"
        },
        {
          "label": "Lilian Weng — The Transformer Family v2",
          "url": "https://lilianweng.github.io/posts/2023-01-27-the-transformer-family-v2/"
        }
      ]
    },
    "gradient-checkpointing": {
      "id": "gradient-checkpointing",
      "name": "Gradient Checkpointing",
      "expansion": "Gradient Checkpointing (Activation Recomputation)",
      "category": "training-pipeline",
      "oneliner": "A memory optimization that trades compute for memory by recomputing activations during the backward pass instead of storing them all.",
      "explanation": "Gradient checkpointing saves GPU memory during training by discarding intermediate activations in the forward pass and recomputing them during the backward pass. Without it, a transformer stores activations from every layer, consuming tens of gigabytes. Checkpointing roughly halves activation memory at the cost of about 33 percent extra compute. It is essential for training large models on limited hardware and is enabled by a single flag in most frameworks.",
      "fundamentals": "Standard: store all $L$ layers' activations, memory $O(L \\cdot B \\cdot S \\cdot D)$ where $B$ = batch, $S$ = seq_len, $D$ = hidden_dim. With checkpointing every $\\sqrt{L}$ layers: memory $O(\\sqrt{L} \\cdot BSD)$, compute increases by ~33% (one extra forward pass). Selective checkpointing: only checkpoint attention layers (expensive to store) while keeping cheap layers (norms, residuals). Flash Attention's recomputation of attention in the backward pass is a form of selective checkpointing.",
      "related": [
        "flash-attention",
        "full-ft",
        "zero",
        "mixed-precision-training"
      ],
      "seen_in": [
        "training-config",
        "code"
      ],
      "foundational_papers": [
        {
          "title": "Training Deep Nets with Sublinear Memory Cost",
          "authors": "Chen et al.",
          "venue": "2016",
          "arxiv": "1604.06174"
        }
      ],
      "resources": [
        {
          "label": "Lilian Weng — How to Train Really Large Models",
          "url": "https://lilianweng.github.io/posts/2021-09-25-train-large/"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "greedy-decoding": {
      "id": "greedy-decoding",
      "name": "Greedy Decoding",
      "expansion": "Greedy Decoding (Argmax Sampling)",
      "category": "sampling-decoding",
      "oneliner": "The simplest decoding strategy — always pick the single highest-probability token at each step. Fast and deterministic but can produce repetitive or suboptimal text.",
      "explanation": "Greedy decoding selects the token with the highest probability at each generation step, equivalent to setting temperature to 0. It produces deterministic output — the same input always gives the same output. While fast and predictable, greedy decoding often misses better sequences that would have been found by exploring lower-probability tokens early on. It tends to produce repetitive text and can get stuck in loops.",
      "related": [
        "temperature",
        "top-p",
        "top-k"
      ],
      "seen_in": [
        "code",
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "grpo": {
      "id": "grpo",
      "name": "GRPO",
      "expansion": "Group Relative Policy Optimization",
      "category": "training-pipeline",
      "oneliner": "PPO variant that drops the critic model — estimates advantages by normalizing rewards across a group of sampled outputs for the same prompt. Powers DeepSeek R1.",
      "explanation": "Group Relative Policy Optimization is an RL algorithm that simplifies PPO by removing the critic model entirely. For each prompt, GRPO samples a group of outputs, scores them, then normalizes rewards within the group to estimate advantages. This replaces the learned value function, cutting memory by 40 to 60 percent. Introduced in DeepSeekMath, GRPO became the training algorithm for DeepSeek R1, where it trained reasoning from scratch using only correctness rewards.",
      "fundamentals": "Sample group $\\{o_1, \\ldots, o_G\\}$ from $\\pi_{\\theta_{\\text{old}}}(\\cdot|q)$. Compute rewards $r_i$. Normalize: $\\hat{A}_i = \\frac{r_i - \\mathrm{mean}(\\mathbf{r})}{\\mathrm{std}(\\mathbf{r})}$. Clipped objective per token $t$: $\\mathcal{L} = -\\frac{1}{G}\\sum_i \\frac{1}{|o_i|}\\sum_t \\min\\!\\bigl(\\rho_t \\hat{A}_i,\\; \\mathrm{clip}(\\rho_t, 1\\pm\\varepsilon)\\hat{A}_i\\bigr) + \\beta\\,\\mathrm{KL}(\\pi_\\theta \\| \\pi_{\\mathrm{ref}})$ where $\\rho_t = \\pi_\\theta(o_t|q,o_{<t})/\\pi_{\\theta_{\\text{old}}}(o_t|q,o_{<t})$. R1 settings: $G$=16, $\\varepsilon$=10, $\\beta$=0.001, lr=3e-6, max length 32K tokens, batch 512.",
      "related": [
        "rlhf",
        "dpo",
        "reasoning-models",
        "rlvr"
      ],
      "seen_in": [
        "research-papers",
        "model-cards",
        "training-recipes"
      ],
      "foundational_papers": [
        {
          "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models",
          "authors": "Shao, Wang, Zhu, Xu et al.",
          "venue": "2024",
          "arxiv": "2402.03300"
        },
        {
          "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
          "authors": "DeepSeek-AI",
          "venue": "2025",
          "arxiv": "2501.12948"
        }
      ],
      "resources": [
        {
          "label": "GRPO deep dive (Cameron Wolfe)",
          "url": "https://cameronrwolfe.substack.com/p/grpo"
        },
        {
          "label": "HuggingFace LLM Course: GRPO",
          "url": "https://huggingface.co/learn/llm-course/en/chapter12/3b"
        },
        {
          "label": "Sebastian Raschka — Understanding Reasoning LLMs",
          "url": "https://magazine.sebastianraschka.com/p/understanding-reasoning-llms"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "gsm8k": {
      "id": "gsm8k",
      "name": "GSM8K",
      "expansion": "GSM8K — Grade School Math 8K",
      "category": "scaling-patterns",
      "oneliner": "A benchmark of 8,500 grade-school math word problems requiring multi-step arithmetic reasoning — the standard test for basic mathematical ability in LLMs.",
      "explanation": "GSM8K is a benchmark of roughly 8,500 math word problems at a grade school level, each requiring 2-8 steps of basic arithmetic reasoning. It tests whether a model can break down a problem into steps and compute the answer correctly. GSM8K has been the standard math reasoning benchmark since its release, appearing on virtually every model card. Frontier models now score above 95 percent, so harder benchmarks like MATH and AIME are used for differentiation.",
      "related": [
        "mmlu",
        "evals",
        "chain-of-thought",
        "tag-math"
      ],
      "seen_in": [
        "model-cards",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Training Verifiers to Solve Math Word Problems",
          "authors": "Cobbe et al.",
          "venue": "2021",
          "arxiv": "2110.14168"
        }
      ],
      "resources": [
        {
          "label": "GSM8K dataset",
          "url": "https://huggingface.co/datasets/openai/gsm8k"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "guardrails": {
      "id": "guardrails",
      "name": "Guardrails",
      "expansion": "LLM Guardrails (Input/Output Safety Filters)",
      "category": "agents-and-tools",
      "oneliner": "Programmable filters that check LLM inputs and outputs against safety rules, content policies, and format constraints in real time.",
      "explanation": "Guardrails are runtime filters placed around an LLM to enforce safety policies and output quality. They inspect prompts before they reach the model (blocking jailbreaks, prompt injections, off-topic requests) and check generated responses before they reach the user (filtering harmful content, enforcing format constraints, validating facts). NVIDIA NeMo Guardrails and Guardrails AI are the two main frameworks.",
      "fundamentals": "Architecture: input rail → LLM → output rail. Rails can be: regex pattern matching, classifier-based (using a small model like Llama Guard), LLM-as-judge (using a second model to evaluate), or programmatic (Pydantic schema validation). NeMo Guardrails uses Colang, a domain-specific language for defining conversational rails. Guardrails AI uses validators composable in a pipeline. Latency overhead: 50-200ms per rail depending on implementation.",
      "related": [
        "tag-guard",
        "red-teaming",
        "agentic-ai",
        "function-calling"
      ],
      "seen_in": [
        "code",
        "documentation",
        "serving-config"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "NVIDIA NeMo Guardrails",
          "url": "https://github.com/NVIDIA/NeMo-Guardrails"
        },
        {
          "label": "Guardrails AI",
          "url": "https://github.com/guardrails-ai/guardrails"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "hallucination": {
      "id": "hallucination",
      "name": "Hallucination",
      "expansion": "LLM Hallucination (Confabulation)",
      "category": "safety-alignment",
      "oneliner": "When a model generates plausible-sounding but factually incorrect or fabricated information not grounded in its training data or provided context.",
      "explanation": "Hallucination is the tendency of language models to generate confident, fluent text that is factually wrong or entirely fabricated. The model may cite nonexistent papers, invent statistics, or describe events that never happened. Hallucinations occur because LLMs are trained to produce probable text, not truthful text — they have no internal fact-checking mechanism.",
      "related": [
        "rag",
        "evals",
        "red-teaming",
        "guardrails"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "TruthfulQA benchmark",
          "url": "https://github.com/sylinrl/TruthfulQA"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "hqq": {
      "id": "hqq",
      "name": "HQQ",
      "expansion": "Half-Quadratic Quantization",
      "category": "quantization-methods",
      "oneliner": "Zero-calibration quantization — no forward passes through the model needed. Uses optimization on weight statistics alone. Supports 1-8 bit.",
      "explanation": "HQQ is a post-training quantization method that compresses model weights without needing any calibration data. It reformulates the quantization problem as a half-quadratic optimization that operates directly on weight tensors, skipping the usual step of running sample data through the model. This makes it extremely fast and eliminates failures caused by unrepresentative calibration sets. Quality is competitive with GPTQ and AWQ at 4-bit precision.",
      "fundamentals": "Treats quantization as proximal optimization: split the problem into a rounding step and a scale-fitting step, alternate. No activations or Hessian needed — operates purely on weight statistics. Supports arbitrary group sizes and asymmetric quantization.",
      "seen_in": [
        "model-name",
        "quantization-config"
      ],
      "related": [
        "awq",
        "calibration-data",
        "gptq",
        "ptq",
        "quantization"
      ],
      "sources": [
        "Badri & Shaji, 'HQQ: Half-Quadratic Quantization,' 2023, github.com/mobiusml/hqq"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Half-Quadratic Quantization of Large Language Models",
          "authors": "Badri & Shaji",
          "venue": "2023",
          "arxiv": "2309.15531"
        }
      ]
    },
    "huggingface-hub": {
      "id": "huggingface-hub",
      "name": "HuggingFace Hub",
      "expansion": "HuggingFace Hub",
      "category": "hf-organizations",
      "oneliner": "The central platform for sharing and discovering ML models, datasets, and applications — hosting over 1 million models with Git-based versioning.",
      "explanation": "HuggingFace Hub is the platform where the open-source ML community shares models, datasets, and demo applications. It hosts over one million model repositories with Git-based versioning, automatic model cards, and integration with the Transformers library for one-line loading. The Hub provides Inference API for testing models without downloading them, Spaces for hosting interactive demos, and Organizations for team collaboration.",
      "related": [
        "safetensors",
        "model-card",
        "org-meta",
        "org-mistral",
        "org-qwen"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "HuggingFace Hub",
          "url": "https://huggingface.co"
        },
        {
          "label": "Hub documentation",
          "url": "https://huggingface.co/docs/hub"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "humaneval": {
      "id": "humaneval",
      "name": "HumanEval",
      "expansion": "HumanEval Code Generation Benchmark",
      "category": "scaling-patterns",
      "oneliner": "A benchmark of 164 Python programming problems where models write functions from docstrings and are graded by unit test pass rate.",
      "explanation": "HumanEval is a code generation benchmark containing 164 hand-written Python programming problems. Each provides a function signature and docstring, and the model must generate a correct implementation graded by the benchmark's unit tests. Results are reported as pass@k — the probability that at least one of k samples passes. HumanEval is the standard benchmark for code models like CodeLlama and DeepSeek-Coder.",
      "fundamentals": "The pass@k metric generates $n \\geq k$ samples per problem and computes the probability that at least one of $k$ randomly chosen samples passes. The unbiased estimator is $\\text{pass@k} = 1 - \\binom{n-c}{k} / \\binom{n}{k}$ where $c$ is the number of correct samples out of $n$. Temperature and top-p sampling settings significantly affect scores. EvalPlus extends HumanEval with 80x more test cases to catch solutions that pass original tests by luck.",
      "related": [
        "tag-coder",
        "inference"
      ],
      "seen_in": [
        "model-cards",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Evaluating Large Language Models Trained on Code",
          "authors": "Chen et al.",
          "venue": "2021",
          "arxiv": "2107.03374"
        }
      ],
      "resources": [
        {
          "label": "HumanEval on GitHub",
          "url": "https://github.com/openai/human-eval"
        },
        {
          "label": "EvalPlus leaderboard",
          "url": "https://evalplus.github.io/leaderboard.html"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "ia3": {
      "id": "ia3",
      "name": "IA3",
      "expansion": "Infused Adapter by Inhibiting and Amplifying Inner Activations",
      "category": "fine-tuning-methods",
      "oneliner": "Learns rescaling vectors (not matrices) for K, V, FFN activations — 10-100$\\times$ fewer params than LoRA. Best for few-shot and massive multi-task serving.",
      "explanation": "IA3 is a parameter-efficient fine-tuning method that learns element-wise scaling vectors for a model's key, value, and feed-forward layers. Instead of adding new weight matrices, it multiplies existing activations by learned scale factors initialized to one, changing model behavior with a tiny number of new parameters. A 7B model needs only about 614K trainable parameters. It excels at few-shot tasks and serving thousands of task-specific adapters simultaneously.",
      "fundamentals": "K' = l_k ⊙ (W_K·x). V' = l_v ⊙ (W_V·x). FFN: h = W_down·(l_ff ⊙ act(W_up·x)). Params: d_k+d_v+d_ff per layer. LLaMA 7B: 19,200/layer $\\times$ 32 = 614K. Optimizer: ~5MB.",
      "seen_in": [
        "model-config"
      ],
      "related": [
        "adapters",
        "ffn",
        "lora",
        "peft"
      ],
      "foundational_papers": [
        {
          "title": "Few-Shot Parameter-Efficient Fine-Tuning is Better and Cheaper than In-Context Learning",
          "authors": "Liu et al.",
          "venue": "NeurIPS 2022",
          "arxiv": "2205.05638"
        }
      ],
      "sources": [
        "Liu et al., arXiv:2205.05638"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10"
    },
    "imatrix": {
      "id": "imatrix",
      "name": "imatrix / Importance Matrix",
      "expansion": "Importance Matrix for Quantization",
      "category": "quantization-methods",
      "oneliner": "Per-weight importance scores computed from calibration data — used by llama.cpp to weight quantization decisions. Powers IQ-quant methods.",
      "explanation": "An importance matrix records how much each weight contributes to the model's output. Computed by running calibration data through the model and accumulating squared activation magnitudes per weight position. Weights with high importance scores get quantized more carefully (in IQ methods, they get mapped to closer codebook entries). Importance matrices are what distinguish IQ-quants from Q-quants in llama.cpp — same bitwidth, but IQ uses importance data to make smarter rounding decisions.",
      "fundamentals": "For each layer with input X: $\\mathrm{importance}[i,j] \\propto \\sum_{\\text{samples}} (X[:,j])^2$. This approximates the diagonal of the Hessian — weights multiplied by large activations matter more. The imatrix is stored as a file and loaded during quantization. IQ-quant codebook assignment is weighted by these scores.",
      "seen_in": [
        "gguf-filename",
        "quantization-scripts"
      ],
      "related": [
        "calibration-data",
        "iq-quants",
        "quantization",
        "tool-llamacpp"
      ],
      "sources": [
        "llama.cpp documentation and source code, github.com/ggerganov/llama.cpp"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10"
    },
    "inference-engine": {
      "id": "inference-engine",
      "name": "Inference Engine",
      "expansion": "Specialized runtime for executing trained LLM weights",
      "category": "scaling-patterns",
      "oneliner": "Purpose-built software for running LLM forward passes at maximum efficiency — 5-20x faster than naive PyTorch via KV cache management, continuous batching, kernel fusion, and quantization support.",
      "explanation": "An inference engine is specialized software for running a trained LLM efficiently in production. These engines exist because serving many concurrent users with autoregressive decoding is fundamentally different from training. They optimize through KV cache management, continuous batching to keep GPUs busy, kernel fusion to reduce memory round-trips, and quantization support for smaller formats. Popular examples include vLLM, llama.cpp, TensorRT-LLM, and TGI.",
      "fundamentals": "Core bottleneck: memory bandwidth, not compute. Each decode step reads ALL weights from GPU memory for ONE token. 70B in fp16 = 140 GB streaming through memory bus per token. A100 at 2 TB/s = ~70ms minimum. Engines attack this: quantization shrinks bytes read, KV caching avoids re-reading past context, batching amortizes the weight-read across many sequences. Two phases: prefill (processing prompt, compute-bound, parallel) vs decode (generating tokens, memory-bound, sequential). Good engines treat them differently — chunked prefill avoids blocking decode for other requests. Throughput vs latency tradeoff: larger batches improve throughput but increase per-request latency. Speculative decoding improves latency without sacrificing throughput.",
      "seen_in": [
        "serving-config",
        "deployment-docs",
        "model-cards"
      ],
      "related": [
        "continuous-batching",
        "fp16",
        "gguf",
        "inference",
        "kv-cache",
        "paged-attention",
        "quantization",
        "speculative-decoding",
        "token",
        "tool-llamacpp",
        "tool-tensorrt-llm",
        "tool-tgi",
        "tool-vllm",
        "tp-pp"
      ],
      "foundational_papers": [
        {
          "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention",
          "authors": "Kwon et al.",
          "venue": "SOSP 2023",
          "arxiv": "2309.06180"
        },
        {
          "title": "Orca: A Distributed Serving System for Transformer-Based Generative Models",
          "authors": "Yu et al.",
          "venue": "OSDI 2022",
          "arxiv": null
        },
        {
          "title": "FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness",
          "authors": "Dao et al.",
          "venue": "NeurIPS 2022",
          "arxiv": "2205.14135"
        }
      ],
      "resources": [
        {
          "label": "vLLM GitHub",
          "url": "https://github.com/vllm-project/vllm"
        },
        {
          "label": "llama.cpp GitHub",
          "url": "https://github.com/ggerganov/llama.cpp"
        },
        {
          "label": "TGI GitHub",
          "url": "https://github.com/huggingface/text-generation-inference"
        },
        {
          "label": "TensorRT-LLM GitHub",
          "url": "https://github.com/NVIDIA/TensorRT-LLM"
        }
      ],
      "sources": [
        "Kwon et al., arXiv:2309.06180",
        "Yu et al., OSDI 2022"
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "inference-metrics": {
      "id": "inference-metrics",
      "name": "Inference Metrics",
      "expansion": "TTFT, ITL, TPS — Core LLM Serving Performance Metrics",
      "category": "scaling-patterns",
      "oneliner": "The three numbers that define LLM serving performance: Time to First Token, Inter-Token Latency, and Tokens Per Second.",
      "explanation": "Inference metrics are the standard measurements for evaluating LLM serving performance. Time to First Token (TTFT) measures how long a user waits before seeing any output, reflecting prefill speed and queuing delays. Inter-Token Latency (ITL) measures the average gap between consecutive output tokens, governing how smooth a streaming response feels. Tokens Per Second (TPS) measures aggregate throughput across all concurrent requests.",
      "fundamentals": "TTFT = time from request arrival to first generated token. Dominated by prefill compute ($O(n \\cdot P)$ where $n$ = input length, $P$ = params) plus any queuing delay. ITL = time between consecutive decode tokens. Dominated by memory bandwidth ($P \\cdot \\text{bytes\\_per\\_param} / \\text{bandwidth}$). TPS = total tokens generated per second across all requests in a batch. For batch size $B$: TPS $\\approx B / \\text{ITL}$ (amortized). Key insight: TTFT is compute-bound, ITL is bandwidth-bound.",
      "related": [
        "inference",
        "inference-engine",
        "continuous-batching",
        "speculative-decoding"
      ],
      "seen_in": [
        "documentation",
        "serving-config"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "BentoML — LLM Inference Metrics",
          "url": "https://bentoml.com/llm/inference-optimization/llm-inference-metrics"
        },
        {
          "label": "Anyscale — LLM Performance Metrics",
          "url": "https://www.anyscale.com/blog/llm-performance-metrics"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "inference": {
      "id": "inference",
      "name": "Inference",
      "expansion": "Model Inference (forward pass for prediction)",
      "category": "scaling-patterns",
      "oneliner": "Running a trained model to produce outputs — the phase where latency, throughput, and memory bandwidth become the dominant constraints.",
      "explanation": "Inference is running a trained model to produce output from a given input. For autoregressive LLMs, it has two phases: prefill processes the full prompt in parallel and is compute-bound, while decode generates tokens one at a time and is memory-bandwidth-bound. This asymmetry explains why serving optimizations like KV-cache quantization, speculative decoding, and batching matter so much.",
      "fundamentals": "During prefill, cost $\\approx 2nP$ FLOPs where $n$ = input tokens, $P$ = parameters. During decode, each step costs $\\approx 2P$ FLOPs but must load all $P$ parameters from HBM. Arithmetic intensity during decode is $\\approx 1/\\text{bytes\\_per\\_param}$, well below GPU compute-to-bandwidth ratio, making decode firmly memory-bandwidth-bound. Batching $B$ requests raises intensity to $B/\\text{bytes\\_per\\_param}$.",
      "related": [
        "inference-engine",
        "kv-cache",
        "quantization",
        "speculative-decoding"
      ],
      "seen_in": [
        "documentation",
        "model-config"
      ],
      "foundational_papers": [
        {
          "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention",
          "authors": "Kwon et al.",
          "venue": "SOSP 2023",
          "arxiv": "2309.06180"
        }
      ],
      "resources": [
        {
          "label": "Kipply — Transformer Inference Arithmetic",
          "url": "https://kipp.ly/transformer-inference-arithmetic/"
        },
        {
          "label": "Lilian Weng — LLM Inference Optimization",
          "url": "https://lilianweng.github.io/posts/2023-01-10-inference-optimization/"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "inspect-eval": {
      "id": "inspect-eval",
      "name": "Inspect",
      "expansion": "Inspect — LLM Evaluation Framework (UK AISI)",
      "category": "serving-tools",
      "oneliner": "An open-source framework from the UK AI Safety Institute for building reproducible, composable LLM evaluations with built-in sandboxing.",
      "explanation": "Inspect is an evaluation framework developed by the UK AI Safety Institute for rigorously testing language model capabilities and safety properties. It provides a Python API for defining evaluation tasks as composable pipelines of solvers, scorers, and datasets. Inspect supports sandboxed code execution, multi-turn agent evaluations, and tool-use tasks out of the box. It has become a standard tool for frontier model safety evaluations and is used by several AI labs for pre-deployment assessment.",
      "fundamentals": "Core abstractions: Task (dataset + solver + scorer), Solver (the strategy for generating answers, e.g., chain-of-thought, tool-use agent), Scorer (grading function: exact match, model-graded, code execution). Sandboxing via Docker containers for safe code execution. Built-in support for OpenAI, Anthropic, Google, and local model APIs. Evaluation logs stored as structured JSON for reproducibility.",
      "related": [
        "evals",
        "mmlu",
        "humaneval"
      ],
      "seen_in": [
        "documentation",
        "code"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "Inspect documentation",
          "url": "https://inspect.ai-safety-institute.org.uk/"
        },
        {
          "label": "Inspect GitHub",
          "url": "https://github.com/UKGovernmentBEIS/inspect_ai"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "int4": {
      "id": "int4",
      "name": "int4 / i4",
      "expansion": "4-bit Integer Quantization",
      "category": "precision-formats",
      "oneliner": "Just 16 discrete levels per value — 8$\\times$ compression from fp32. Makes 30B+ models runnable on consumer GPUs.",
      "explanation": "int4 is a 4-bit integer precision format that stores each weight using only four bits, cutting model memory by roughly four times compared to 16-bit formats. Naive 4-bit rounding destroys quality, so per-group quantization is essential: weights are divided into small groups of 32 to 128, each with its own scale factor. The overhead is minimal at about 0.125 extra bits per weight. int4 is the standard precision for running large models on consumer GPUs.",
      "fundamentals": "Signed: -8 to 7 (two's complement in 4 bits). Two values packed per byte. Per-group storage (g=128): 128$\\times$4 bits + 16 bits (fp16 scale) + optional 4 bits (zero-point) = 4.125 effective bpw. Step size for range [-1,1]: 2/15 $\\approx$ 0.133 — 17$\\times$ coarser than int8.",
      "seen_in": [
        "model-name",
        "filename"
      ],
      "related": [
        "awq",
        "fp16",
        "gptq",
        "int8",
        "nf4",
        "per-group-quantization",
        "quantization",
        "w4a16"
      ],
      "sources": [
        "Frantar et al., 'GPTQ,' ICLR 2023, arXiv:2210.17323",
        "Lin et al., 'AWQ,' MLSys 2024, arXiv:2306.00978"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "GPTQ paper (foundational for int4 LLM use)",
          "url": "https://arxiv.org/abs/2210.17323"
        }
      ]
    },
    "int8": {
      "id": "int8",
      "name": "int8 / i8",
      "expansion": "8-bit Integer Quantization",
      "category": "precision-formats",
      "oneliner": "Fixed-point 8-bit: maps floats to 256 discrete levels via scale+zero-point. 4$\\times$ compression from fp32, runs on all modern hardware.",
      "explanation": "int8 is an 8-bit integer precision format that represents values using 256 discrete levels from negative 128 to positive 127. Integer math units are simpler and more power-efficient than floating-point ones and are available on virtually all modern hardware. The main challenge for language models is that activations can have extreme outlier values in certain channels. Solutions like per-channel scaling, mixed-precision decomposition, and SmoothQuant address this effectively.",
      "fundamentals": "Quantize: x_q = round(x / scale) + zero_point, clamped to [-128, 127]. Dequantize: x $\\approx$ (x_q - zero_point) $\\times$ scale. Symmetric (common for weights): zero_point=0, scale = max(|x|)/127. Step size = scale. For weights in [-1, 1]: step $\\approx$ 0.0078 — values closer than ~0.008 become identical.",
      "seen_in": [
        "model-name",
        "filename",
        "serving-config"
      ],
      "related": [
        "int4",
        "w8a8"
      ],
      "sources": [
        "Dettmers et al., 'LLM.int8(),' NeurIPS 2022, arXiv:2208.07339",
        "Xiao et al., 'SmoothQuant,' ICML 2023, arXiv:2211.10438"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale",
          "authors": "Dettmers et al.",
          "venue": "NeurIPS 2022",
          "arxiv": "2208.07339"
        },
        {
          "title": "SmoothQuant: Accurate and Efficient Post-Training Quantization for LLMs",
          "authors": "Xiao et al.",
          "venue": "ICML 2023",
          "arxiv": "2211.10438"
        }
      ]
    },
    "iq-quants": {
      "id": "iq-quants",
      "name": "IQ-Quants (IQ1 through IQ4)",
      "expansion": "Importance-weighted Quantization (codebook-based)",
      "category": "formats",
      "oneliner": "llama.cpp's codebook/lookup-table quantization — uses importance matrices to make smarter rounding decisions. Better than K-quants at same bitwidth.",
      "explanation": "IQ-quants are a family of quantization formats in llama.cpp that use vector quantization with learned codebooks instead of simple scalar rounding. An importance matrix guides which weight values receive the most protection during compression. They deliver higher quality than the older K-quant formats at the same file size, with the biggest advantage at very low bitwidths like IQ2 and IQ3. Generation is slower than K-quants but inference speed is comparable.",
      "fundamentals": "Codebook approach: a fixed set of representative vectors (codebook entries). Each group of weights is mapped to the nearest codebook entry. With importance weighting: minimize $\\sum_i \\mathrm{importance}[i] \\times (w[i] - \\mathrm{codebook}[\\mathrm{nearest}][i])^2$ instead of unweighted MSE. IQ4_XS at ~4.25 bpw outperforms Q4_K_S at ~4.5 bpw.",
      "seen_in": [
        "gguf-filename"
      ],
      "related": [
        "gguf",
        "imatrix",
        "inference",
        "k-quants",
        "quantization",
        "tool-llamacpp"
      ],
      "sources": [
        "llama.cpp source code and documentation"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "llama.cpp PR #4773 (IQ quants)",
          "url": "https://github.com/ggerganov/llama.cpp/pull/4773"
        },
        {
          "label": "llama.cpp quantization docs",
          "url": "https://github.com/ggerganov/llama.cpp/blob/master/examples/quantize/README.md"
        }
      ]
    },
    "jailbreak": {
      "id": "jailbreak",
      "name": "Jailbreak",
      "expansion": "Jailbreak (LLM Safety Bypass)",
      "category": "safety-alignment",
      "oneliner": "A prompt attack that tricks a safety-trained model into producing content it was aligned to refuse — through roleplay, encoding tricks, or multi-step manipulation.",
      "explanation": "A jailbreak is a specific form of prompt injection that circumvents a model's safety training to produce harmful, unethical, or restricted content. Common techniques include: roleplay scenarios that put the model in a character who ignores rules, encoding harmful requests in base64 or other formats the model can decode, multi-turn escalation that gradually shifts the conversation boundary, and adversarial suffixes found by gradient-based search.",
      "related": [
        "prompt-injection",
        "red-teaming",
        "guardrails",
        "constitutional-ai"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "OWASP LLM Top 10",
          "url": "https://genai.owasp.org/llmrisk/"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "k-quants": {
      "id": "k-quants",
      "name": "K-Quants (Q2_K through Q8_K)",
      "expansion": "K-quant block quantization system",
      "category": "formats",
      "oneliner": "llama.cpp's standard quantization types — hierarchical super-blocks of 256 elements with mixed-precision sub-block scales.",
      "explanation": "K-quants are llama.cpp's standard family of quantization types, organized into levels from Q2_K through Q8_K. Each type uses hierarchical super-blocks of 256 elements divided into sub-blocks, each with its own quantized scale and minimum. The letter suffixes S, M, and L indicate how many bits are spent on scale precision versus data. Q4_K_M is the most popular default, offering a good balance of quality and compression for local inference.",
      "fundamentals": "Q4_K_M example: super-block of 256 values, split into 8 sub-blocks of 32. Each sub-block: 32 $\\times$ 4-bit values + 6-bit scale + 6-bit min. Super-block has fp16 master scale and fp16 master min. Effective bpw: 4 + (6+6)/32 + (16+16)/256 $\\approx$ 4.5 bpw. S tier: fewer scale bits, smaller. M tier: balanced. L tier: more scale bits, larger/better.",
      "seen_in": [
        "gguf-filename"
      ],
      "related": [
        "fp16",
        "gguf",
        "inference",
        "iq-quants",
        "quantization",
        "tool-llamacpp",
        "unsloth-dynamic"
      ],
      "sources": [
        "llama.cpp PR #1684 by Iwan Kawrakow"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "llama.cpp PR #1684 by Iwan Kawrakow (K-quants)",
          "url": "https://github.com/ggerganov/llama.cpp/pull/1684"
        }
      ]
    },
    "knowledge-distillation": {
      "id": "knowledge-distillation",
      "name": "Knowledge Distillation",
      "expansion": "Knowledge Distillation (KD)",
      "category": "scaling-patterns",
      "oneliner": "Train a small 'student' to mimic a large 'teacher's output distributions — transfers 'dark knowledge' from soft probabilities that hard labels miss.",
      "explanation": "Knowledge distillation trains a smaller student model to mimic a larger teacher model. The teacher's soft probability distribution contains richer information than hard labels alone, revealing inter-class relationships the student can learn from. Modern LLM distillation often uses synthetic data generated by the teacher rather than direct logit matching. Typical result is 85-95 percent of teacher quality at one-tenth the size, as seen in LLaMA 3.2 and Phi models.",
      "fundamentals": "L = $\\alpha$·CE(y_true, $\\sigma$(z_s)) + (1-$\\alpha$)·$T^2$·KL($\\sigma$(z_t/T) || $\\sigma$(z_s/T)). Temperature T softens distributions. $T^2$ compensates gradient magnitude. Hard labels: ~15 bits/example (lo$g_2$(32K)). Soft labels: 32K continuous values — orders of magnitude more info. Distillation vs pruning: pruning removes params from existing model (architecture shrinks); distillation trains new smaller model from scratch using teacher signal. Can combine: prune then distill to recover (LLaMA 3.2 approach).",
      "seen_in": [
        "model-cards",
        "documentation"
      ],
      "related": [
        "dense-models",
        "quantization"
      ],
      "sources": [
        "Hinton et al., 'Distilling the Knowledge in a Neural Network,' 2015, arXiv:1503.02531",
        "Meta, 'The Llama 3 Herd of Models,' arXiv:2407.21783"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Distilling the Knowledge in a Neural Network",
          "authors": "Hinton et al.",
          "venue": "NeurIPS Workshop 2015",
          "arxiv": "1503.02531"
        }
      ],
      "resources": [
        {
          "label": "Lilian Weng — LLM Inference Optimization",
          "url": "https://lilianweng.github.io/posts/2023-01-10-inference-optimization/"
        }
      ]
    },
    "kto": {
      "id": "kto",
      "name": "KTO",
      "expansion": "Kahneman-Tversky Optimization",
      "category": "training-pipeline",
      "oneliner": "Only needs thumbs-up/down per response (not paired preferences) — grounded in prospect theory's loss aversion.",
      "explanation": "KTO is an alignment training method that learns from simple binary feedback, where each response is independently labeled as good or bad rather than compared in pairs. It draws on prospect theory from behavioral economics, penalizing bad outputs more heavily than it rewards good ones at roughly a two-to-one ratio. This makes it practical when paired preference rankings are expensive to collect. It requires a reference model to anchor the optimization.",
      "fundamentals": "Desirable: v = $\\sigma$($\\beta$·log($\\pi_\\theta$/$\\pi_{\\text{ref}}$) - z_ref). Undesirable: v = $\\sigma$(z_ref - $\\beta$·log($\\pi_\\theta$/$\\pi_{\\text{ref}}$)). Loss: -E_w[$\\lambda$_w·v(y_w)] - E_l[$\\lambda$_l·v(y_l)], $\\lambda$_l > $\\lambda$_w. Data: {prompt, response, label: bool}.",
      "seen_in": [
        "model-cards"
      ],
      "related": [
        "dpo",
        "rlhf"
      ],
      "foundational_papers": [
        {
          "title": "KTO: Model Alignment as Prospect Theoretic Optimization",
          "authors": "Ethayarajh et al.",
          "venue": "ICML 2024",
          "arxiv": "2402.01306"
        },
        {
          "title": "Prospect Theory: An Analysis of Decision under Risk",
          "authors": "Kahneman & Tversky",
          "venue": "Econometrica 1979",
          "arxiv": null
        }
      ],
      "sources": [
        "Ethayarajh et al., arXiv:2402.01306"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "HuggingFace TRL docs",
          "url": "https://huggingface.co/docs/trl/en/index"
        }
      ]
    },
    "kv-cache-quantization": {
      "id": "kv-cache-quantization",
      "name": "KV Cache Quantization",
      "expansion": "Key-Value Cache Quantization",
      "category": "quantization-methods",
      "oneliner": "Quantize the KV cache tensors themselves (not model weights) to FP8, INT4, or even 2-bit, slashing memory so more requests or longer contexts fit in GPU RAM.",
      "explanation": "KV cache quantization reduces the precision of stored key and value vectors in the attention cache, independently of any weight quantization. Because the KV cache can exceed the size of the model weights at long contexts and large batches, compressing it to FP8 or INT4 directly increases the number of concurrent requests a server can handle.",
      "fundamentals": "KV cache memory: $2 \\times n_{\\text{layers}} \\times n_{\\text{kv\\_heads}} \\times d_{\\text{head}} \\times \\text{seq\\_len} \\times \\text{batch} \\times \\text{bytes}$. Quantising from FP16 (2 B) to FP8 (1 B) halves cache; to INT4 (0.5 B) quarters it. KIVI: keys quantised per-channel (outliers concentrate on specific channels), values per-token. 2-bit asymmetric quantisation retains >98% of FP16 quality on Llama-2-7B. Residual length: keep last $r$ tokens in full precision to preserve recent context fidelity. Compatible with PagedAttention: quantise when writing blocks, dequantise on read.",
      "related": [
        "kv-cache",
        "fp8",
        "int4",
        "quantization"
      ],
      "seen_in": [
        "serving-config",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache",
          "authors": "Liu et al.",
          "venue": "ICML 2024",
          "arxiv": "2402.02750"
        },
        {
          "title": "KVQuant: Towards 10 Million Context Length LLM Inference with KV Cache Quantization",
          "authors": "Hooper et al.",
          "venue": "NeurIPS 2024",
          "arxiv": "2401.18079"
        }
      ],
      "resources": [
        {
          "label": "KIVI paper",
          "url": "https://arxiv.org/abs/2402.02750"
        },
        {
          "label": "KVQuant paper",
          "url": "https://arxiv.org/abs/2401.18079"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "kv-cache": {
      "id": "kv-cache",
      "name": "KV Cache",
      "expansion": "Key-Value Cache",
      "category": "attention-variants",
      "oneliner": "Stores computed K and V tensors from all previous tokens during autoregressive generation — avoids recomputation but is THE memory bottleneck for LLM serving.",
      "explanation": "The KV cache stores key and value vectors from all previous tokens so they are not recomputed at each generation step. Without it, generating each new token would require reprocessing the entire sequence from scratch. The cache grows with sequence length, layer count, and number of KV heads, and can exceed the model weights themselves at long contexts. This memory pressure motivated techniques like GQA, MQA, PagedAttention, and sliding window attention.",
      "fundamentals": "Formula: KV_cache_bytes = 2 $\\times$ n_layers $\\times$ n_kv_heads $\\times$ head_dim $\\times$ seq_len $\\times$ batch_size $\\times$ bytes_per_param. Examples (fp16): Llama 2 7B (32 layers, 32 KV heads): 16 KB/token/layer, 2 GB at 4K context. Llama 2 70B (80 layers, 8 KV heads): 4 KB/token/layer, 1.25 GB at 4K. At batch=32, seq=4096: Llama 70B = 40 GB cache (28% of weight memory). At batch=128, seq=8192: 320 GB — exceeds model weights. Each decode step reads entire cache from HBM — this bandwidth is the bottleneck.",
      "seen_in": [
        "code",
        "serving-config",
        "documentation"
      ],
      "related": [
        "fp16",
        "gqa",
        "mha",
        "mla",
        "mqa",
        "paged-attention",
        "swa",
        "token"
      ],
      "sources": [
        "Pope et al., 'Efficiently Scaling Transformer Inference,' MLSys 2023, arXiv:2211.05102"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Efficiently Scaling Transformer Inference",
          "authors": "Pope et al.",
          "venue": "MLSys 2023",
          "arxiv": "2211.05102"
        }
      ],
      "resources": [
        {
          "label": "Lilian Weng — LLM Inference Optimization",
          "url": "https://lilianweng.github.io/posts/2023-01-10-inference-optimization/"
        },
        {
          "label": "Jay Alammar — The Illustrated GPT-2",
          "url": "https://jalammar.github.io/illustrated-gpt2/"
        }
      ]
    },
    "latency": {
      "id": "latency",
      "name": "Latency",
      "expansion": "LLM Inference Latency",
      "category": "scaling-patterns",
      "oneliner": "The time from sending a request to receiving the complete response — the user-facing performance metric that determines whether an LLM application feels responsive.",
      "explanation": "Latency in LLM serving is the total time a user waits for a response. It breaks down into time-to-first-token (how long before streaming starts), inter-token latency (the gap between streamed tokens), and total generation time. Latency is affected by model size, quantization level, batch size, prompt length, and hardware. For interactive applications, TTFT under 500ms and ITL under 50ms are typical targets.",
      "related": [
        "inference-metrics",
        "inference",
        "speculative-decoding",
        "prefix-caching"
      ],
      "seen_in": [
        "documentation",
        "serving-config"
      ],
      "foundational_papers": [],
      "resources": [],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "layernorm": {
      "id": "layernorm",
      "name": "LayerNorm",
      "expansion": "Layer Normalization",
      "category": "layer-types",
      "oneliner": "Standardizes activations across the hidden dimension per token — zero mean, unit variance, then learnable scale and shift. The original transformer normalization.",
      "explanation": "Layer normalization stabilizes training by normalizing activations across the hidden dimension of each token independently. It computes the mean and variance over all hidden dimensions, normalizes to zero mean and unit variance, then applies a learnable scale and shift. LayerNorm was the default in the original Transformer, GPT-2, and BERT. Post-2022 decoder LLMs have largely replaced it with RMSNorm, which is faster and performs equivalently.",
      "fundamentals": "$\\text{LayerNorm}(x) = \\gamma \\cdot \\frac{x - \\mu}{\\sqrt{\\sigma^2 + \\varepsilon}} + \\beta$. Where $\\mu = \\frac{1}{d}\\sum_i x_i$ and $\\sigma^2 = \\frac{1}{d}\\sum_i (x_i - \\mu)^2$. $\\gamma, \\beta \\in \\mathbb{R}^d$ are learnable (init: $\\gamma=1, \\beta=0$). Params: $2 \\times d_{\\text{model}}$. Requires 2 reductions (mean, variance) vs RMSNorm's 1 (sum of squares). Batch-independent (unlike BatchNorm), no running stats at inference.",
      "seen_in": [
        "model-config",
        "code"
      ],
      "related": [
        "inference",
        "pre-norm",
        "rmsnorm",
        "token"
      ],
      "sources": [
        "Ba et al., 'Layer Normalization,' 2016, arXiv:1607.06450"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Layer Normalization",
          "authors": "Ba et al.",
          "venue": "2016",
          "arxiv": "1607.06450"
        }
      ]
    },
    "learned-pe": {
      "id": "learned-pe",
      "name": "Learned Position Embeddings",
      "expansion": "Learned (Absolute) Position Embeddings",
      "category": "position-encodings",
      "oneliner": "A trainable lookup table of position vectors (like a vocabulary for positions) — simple but has a hard max length limit. Used by GPT-2, BERT, GPT-3.",
      "explanation": "Learned positional embeddings use a trainable lookup table where each position index maps to a learned vector added to the token embedding. The model discovers whatever positional patterns are useful during training. The key limitation is that it cannot handle sequences longer than the maximum position trained. This approach has been replaced by RoPE in modern LLMs but remains common in BERT variants and Vision Transformers.",
      "fundamentals": "x'_m = x_m + W_pos[m]. W_pos trained via backpropagation. GPT-2: L_max=1024, d=768 → 786K params. For 128K context: would be 537M params — impractical. Learned embeddings often show: low-freq sinusoidal patterns in principal components, nearby positions more similar, boundary effects at first/last positions. Cannot extrapolate — position L_max+1 has no representation.",
      "seen_in": [
        "model-config",
        "model-weights"
      ],
      "related": [
        "alibi",
        "rope",
        "sinusoidal-pe",
        "token"
      ],
      "sources": [
        "Radford et al., 'Language Models are Unsupervised Multitask Learners,' 2019 (GPT-2)",
        "Devlin et al., 'BERT,' NAACL 2019, arXiv:1810.04805"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "GPT-2 paper (Radford et al., 2019)",
          "url": "https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf"
        },
        {
          "label": "BERT paper",
          "url": "https://arxiv.org/abs/1810.04805"
        }
      ]
    },
    "license-types": {
      "id": "license-types",
      "name": "License Types",
      "expansion": "Common LLM Licenses",
      "category": "model-naming",
      "oneliner": "The legal terms governing how you can use a model — Apache 2.0 (fully permissive), MIT, Llama Community License (restrictions), and CC-BY-NC (non-commercial only).",
      "explanation": "LLM licenses determine what you can legally do with model weights. Apache 2.0 and MIT are fully permissive — use for anything including commercial products. The Llama Community License allows commercial use but adds restrictions on large-scale deployment and requires attribution. CC-BY-NC prohibits commercial use entirely. Some models use custom licenses with specific restrictions on use cases or output. Always check the license before deploying a model commercially.",
      "related": [
        "open-weights",
        "model-card",
        "huggingface-hub"
      ],
      "seen_in": [
        "model-cards",
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "HuggingFace — Model licenses",
          "url": "https://huggingface.co/docs/hub/repositories-licenses"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "linear-attention": {
      "id": "linear-attention",
      "name": "Linear Attention",
      "expansion": "Linear Attention (Kernel-Based Attention)",
      "category": "attention-variants",
      "oneliner": "A family of attention variants that replace softmax with linear kernels to achieve O(n) complexity instead of O(n²), enabling much longer sequences.",
      "explanation": "Linear attention replaces the softmax in standard attention with a kernel function that allows the computation to be rearranged from O(n²) to O(n) in sequence length. Instead of computing the full n-by-n attention matrix, it factorizes the computation so keys and values are aggregated first, then multiplied by each query. This enables processing very long sequences with constant memory per token.",
      "related": [
        "self-attention",
        "flash-attention",
        "mamba",
        "swa"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention",
          "authors": "Katharopoulos et al.",
          "venue": "ICML 2020",
          "arxiv": "2006.16236"
        }
      ],
      "resources": [],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "llm-as-judge": {
      "id": "llm-as-judge",
      "name": "LLM-as-a-Judge",
      "expansion": "LLM-as-a-Judge (Model-Graded Evaluation)",
      "category": "scaling-patterns",
      "oneliner": "Using a language model to score or rank outputs from another model, replacing or supplementing human evaluation for scalability.",
      "explanation": "LLM-as-a-Judge is an evaluation paradigm where a strong language model (typically GPT-4 or Claude) scores the quality of outputs from another model. The judge receives a prompt, a candidate response, and evaluation criteria, then assigns a score or ranking. This approach scales evaluation to thousands of examples at low cost. It powers benchmarks like AlpacaEval and MT-Bench and is widely used in production for monitoring output quality.",
      "related": [
        "evals",
        "mmlu",
        "humaneval",
        "rlaif"
      ],
      "seen_in": [
        "documentation",
        "code"
      ],
      "foundational_papers": [
        {
          "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
          "authors": "Zheng et al.",
          "venue": "NeurIPS 2023",
          "arxiv": "2306.05685"
        }
      ],
      "resources": [
        {
          "label": "MT-Bench paper",
          "url": "https://arxiv.org/abs/2306.05685"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "llmops": {
      "id": "llmops",
      "name": "LLMOps",
      "expansion": "LLM Operations",
      "category": "scaling-patterns",
      "oneliner": "The operational discipline for managing LLMs in production — prompt versioning, evaluation pipelines, cost tracking, monitoring, and deployment automation.",
      "explanation": "LLMOps is the practice of managing the full lifecycle of LLM-powered applications in production. It extends traditional MLOps with LLM-specific concerns: prompt version control and testing, evaluation pipelines that run automatically on prompt changes, cost monitoring per request, latency tracking, hallucination detection, and A/B testing of model versions. Tools in this space include LangSmith, LangFuse, Weights & Biases, and Braintrust.",
      "related": [
        "inference-metrics",
        "evals",
        "tool-langchain",
        "guardrails"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "LangSmith",
          "url": "https://smith.langchain.com"
        },
        {
          "label": "LangFuse",
          "url": "https://langfuse.com"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "lora": {
      "id": "lora",
      "name": "LoRA",
      "expansion": "Low-Rank Adaptation of Large Language Models",
      "category": "fine-tuning-methods",
      "oneliner": "Freeze base weights, add trainable low-rank bypass $W_0$ + ($\\alpha$/r)·B·A. Typically 0.1-0.6% of params. Zero inference overhead after merging.",
      "explanation": "Low-Rank Adaptation freezes all original model weights and injects small trainable low-rank matrices alongside them, typically training under one percent of total parameters. The insight is that fine-tuning updates naturally have low intrinsic rank, so two small matrices can capture the needed changes. After training, the adapters merge back into the base weights with zero inference overhead. LoRA is the dominant parameter-efficient fine-tuning method today.",
      "fundamentals": "Forward: $h = W_0 x + (\\alpha/r) \\cdot B \\cdot A \\cdot x$. LLaMA 7B $r$=16 all linear: ~40M params (0.6%). Memory: ~14.8GB (vs ~84GB full FT). Merge: one-time matrix add $W = W_0 + (\\alpha/r) \\cdot BA$. Files: adapter_model.safetensors (~80MB) + adapter_config.json.",
      "seen_in": [
        "model-name",
        "model-config",
        "filename"
      ],
      "related": [
        "adapters",
        "dora",
        "ia3",
        "inference",
        "peft",
        "qlora",
        "safetensors",
        "zero"
      ],
      "foundational_papers": [
        {
          "title": "LoRA: Low-Rank Adaptation of Large Language Models",
          "authors": "Hu et al.",
          "venue": "ICLR 2022",
          "arxiv": "2106.09685"
        },
        {
          "title": "Intrinsic Dimensionality Explains the Effectiveness of Language Model Fine-Tuning",
          "authors": "Aghajanyan et al.",
          "venue": "ACL 2021",
          "arxiv": "2012.13255"
        },
        {
          "title": "LoRA+: Efficient Low Rank Adaptation of Large Models",
          "authors": "Hayou et al.",
          "venue": "ICML 2024",
          "arxiv": "2402.12354"
        }
      ],
      "sources": [
        "Hu et al., arXiv:2106.09685"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Sebastian Raschka — Practical Tips for LoRA",
          "url": "https://magazine.sebastianraschka.com/p/practical-tips-for-finetuning-llms"
        },
        {
          "label": "Cameron Wolfe — PEFT, LoRA, QLoRA",
          "url": "https://cameronrwolfe.substack.com/p/easily-train-a-specialized-llm-peft"
        },
        {
          "label": "HuggingFace PEFT LoRA docs",
          "url": "https://huggingface.co/docs/peft/en/package_reference/lora"
        }
      ]
    },
    "mamba": {
      "id": "mamba",
      "name": "Mamba / SSM",
      "expansion": "Mamba (Selective State Space Model)",
      "category": "layer-types",
      "oneliner": "A non-transformer architecture that models sequences in linear time using selective state spaces instead of quadratic attention.",
      "explanation": "Mamba is an alternative to the transformer that replaces self-attention with a selective state space mechanism, reducing sequence processing from quadratic to linear time. Instead of attending between every token pair, it processes sequences through a learned recurrence that selectively remembers or forgets information. This makes it faster for long sequences while matching transformer quality. Mamba-based models appear on HuggingFace and have influenced hybrid architectures.",
      "fundamentals": "State space models compute $y = \\text{SSM}(A, B, C)(x)$ where $A \\in \\mathbb{R}^{N \\times N}$ is the state transition matrix, discretized for each timestep. Mamba's key innovation is making $B$, $C$, and $\\Delta$ (discretization step) input-dependent via learned projections, enabling content-based selection. Computation uses a parallel scan algorithm: $O(L \\cdot N \\cdot D)$ time and $O(L \\cdot D)$ memory vs attention's $O(L^2 \\cdot D)$.",
      "related": [
        "self-attention",
        "mha",
        "flash-attention"
      ],
      "seen_in": [
        "model-config",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Mamba: Linear-Time Sequence Modeling with Selective State Spaces",
          "authors": "Gu & Dao",
          "venue": "2023",
          "arxiv": "2312.00752"
        }
      ],
      "resources": [
        {
          "label": "Mamba paper",
          "url": "https://arxiv.org/abs/2312.00752"
        },
        {
          "label": "The Annotated Mamba",
          "url": "https://srush.github.io/annotated-mamba/hard.html"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "marlin": {
      "id": "marlin",
      "name": "Marlin",
      "expansion": "Mixed-Precision Auto-Regressive Parallel Inference on GPUs",
      "category": "quantization-methods",
      "oneliner": "Not a quantization method — a GPU kernel that makes GPTQ/AWQ inference fast. Fuses dequantization with matrix multiply.",
      "explanation": "Marlin is a CUDA kernel designed specifically for W4A16 inference. It reads int4 weights from memory, dequantizes in shared memory, and performs the fp16 GEMM in a single fused operation. This avoids materializing the full fp16 weight matrix and maximizes memory bandwidth utilization. Used internally by vLLM for GPTQ and AWQ models. Achieves near-theoretical-maximum memory bandwidth utilization (>90% on A100/H100).",
      "fundamentals": "Key optimizations: 1) Async global→shared memory copy of packed int4 weights. 2) Dequantize in shared memory using scale factors. 3) Feed into Tensor Core WMMA operations. 4) Asynchronous pipelining of load and compute stages. Achieves 4$\\times$ throughput of naive fp16 GEMM for batch size 1.",
      "seen_in": [
        "serving-config",
        "vllm-internals"
      ],
      "related": [
        "awq",
        "fp16",
        "gptq",
        "inference",
        "int4",
        "tool-vllm",
        "w4a16"
      ],
      "sources": [
        "Frantar et al., 'Marlin,' arXiv:2408.11743"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Marlin: Mixed-Precision Auto-Regressive Parallel INference on GPUs",
          "authors": "Frantar et al.",
          "venue": "2024",
          "arxiv": "2408.11743"
        }
      ]
    },
    "mcp": {
      "id": "mcp",
      "name": "MCP",
      "expansion": "Model Context Protocol",
      "category": "agents-and-tools",
      "oneliner": "Anthropic's open protocol for connecting LLMs to external tools and data sources. A universal plug-and-play standard for agent integrations.",
      "explanation": "MCP is an open protocol introduced by Anthropic in November 2024 that standardises how LLM applications discover and invoke external tools, access data sources, and manage context. It defines a client-server architecture where an MCP server exposes tools (callable functions), resources (readable data), and prompts (reusable templates), while an MCP client inside the LLM application connects to one or more servers.",
      "fundamentals": "Architecture: Host (IDE, chat app) embeds an MCP Client that connects to MCP Servers. Each server declares capabilities via initialize handshake. Tools are described with JSON Schema inputs and can return text or structured JSON. Resources provide read-only data (files, DB rows). Prompts are server-defined templates. Transport: stdio for local servers, HTTP+SSE or streamable HTTP for remote. Security: servers are OAuth 2.0 Resource Servers; tokens are scoped per server. The 2025-06-18 spec added structured output, elicitation (server asks user for input mid-session), and task tracking for long-running operations.",
      "related": [
        "agentic-ai",
        "function-calling"
      ],
      "seen_in": [
        "documentation",
        "code"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "MCP specification",
          "url": "https://modelcontextprotocol.io/specification/2025-11-25"
        },
        {
          "label": "Anthropic MCP announcement",
          "url": "https://www.anthropic.com/news/model-context-protocol"
        },
        {
          "label": "MCP GitHub repo",
          "url": "https://github.com/modelcontextprotocol/modelcontextprotocol"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "metr": {
      "id": "metr",
      "name": "METR",
      "expansion": "METR — Model Evaluation & Threat Research",
      "category": "scaling-patterns",
      "oneliner": "An organization that designs task-based evaluations measuring whether AI models can autonomously perform dangerous capabilities like cyberattacks or self-replication.",
      "explanation": "METR is a nonprofit research organization that develops evaluations for measuring dangerous autonomous capabilities in AI models. Their task suites test whether a model can accomplish multi-step real-world goals like writing exploits, conducting research, or acquiring resources without human guidance. METR evaluations are used by frontier labs including OpenAI and Anthropic as part of pre-deployment safety assessments.",
      "fundamentals": "METR tasks are agent-based: the model operates in a sandboxed environment with a shell, browser, and file system, attempting to complete a goal autonomously. Scoring is binary (task completed or not) with time limits. Task categories include: software engineering, cybersecurity, research, and resource acquisition. Results inform responsible scaling frameworks — if a model passes certain capability thresholds, labs agree to implement additional safeguards before deployment.",
      "related": [
        "evals",
        "red-teaming",
        "agentic-ai"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "METR website",
          "url": "https://metr.org"
        },
        {
          "label": "METR task suite",
          "url": "https://github.com/METR/task-standard"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "mha": {
      "id": "mha",
      "name": "MHA",
      "expansion": "Multi-Head Attention",
      "category": "attention-variants",
      "oneliner": "The original transformer attention — each head independently computes Q, K, V over the full sequence, then results are concatenated.",
      "explanation": "Multi-head attention is the original attention mechanism from the Transformer architecture. Input embeddings are projected into queries, keys, and values per head, where each head operates on a lower-dimensional slice. Heads compute scaled dot-product attention independently, then results are concatenated and projected back. Different heads can specialize in syntactic, semantic, or positional patterns. MHA is the baseline for all attention variants.",
      "fundamentals": "For hidden_dim=4096, n_heads=32, head_dim=128: W_Q, W_K, W_V each [4096, 4096], W_O [4096, 4096]. Q·K^T / $\\sqrt{128}$ → [batch, 32, seq_len, seq_len] attention matrix → softmax → multiply by V → [batch, 32, seq_len, 128] → concat → [batch, seq_len, 4096]. Param count: $4 \\times 4096^2$ = ~67M per layer. KV cache per token per layer: 2 $\\times$ 32 $\\times$ 128 $\\times$ 2 bytes = 16 KB. For 32 layers at 4096 tokens: ~2 GB. Compute: $O(\\text{seq\\_len}^2 \\times d_{\\text{model}})$ per layer — the quadratic cost.",
      "seen_in": [
        "model-config",
        "architecture-papers"
      ],
      "related": [
        "flash-attention",
        "gqa",
        "kv-cache",
        "mla",
        "mqa",
        "sdpa",
        "token"
      ],
      "sources": [
        "Vaswani et al., 'Attention Is All You Need,' NeurIPS 2017, arXiv:1706.03762"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Attention Is All You Need",
          "authors": "Vaswani et al.",
          "venue": "NeurIPS 2017",
          "arxiv": "1706.03762"
        }
      ],
      "resources": [
        {
          "label": "The Illustrated Transformer (Jay Alammar)",
          "url": "https://jalammar.github.io/illustrated-transformer/"
        },
        {
          "label": "3Blue1Brown — Attention in Transformers",
          "url": "https://www.3blue1brown.com/lessons/attention"
        },
        {
          "label": "Sebastian Raschka — LLM Architecture Gallery",
          "url": "https://sebastianraschka.com/llm-architecture-gallery/"
        }
      ]
    },
    "min-p": {
      "id": "min-p",
      "name": "Min-p Sampling",
      "expansion": "Min-p (Minimum Probability) Sampling",
      "category": "sampling-decoding",
      "oneliner": "A dynamic sampling threshold that keeps tokens with probability above a fraction of the top token's probability — an adaptive alternative to fixed top-k or top-p.",
      "explanation": "Min-p sampling retains all tokens whose probability exceeds a fraction of the highest-probability token. If the top token has probability 0.9, min-p of 0.1 keeps only tokens above 0.09. If the top token has probability 0.3, it keeps tokens above 0.03 — naturally widening the candidate pool when the model is uncertain. This adapts more gracefully than top-k (fixed count) or top-p (fixed cumulative mass). Min-p has gained popularity in the llama.cpp ecosystem and other open-source inference engines.",
      "related": [
        "top-p",
        "top-k",
        "temperature"
      ],
      "seen_in": [
        "model-config",
        "code"
      ],
      "foundational_papers": [],
      "resources": [],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "mixed-precision-training": {
      "id": "mixed-precision-training",
      "name": "Mixed-Precision Training",
      "expansion": "Mixed-Precision Training (FP16/BF16 compute + FP32 master weights)",
      "category": "training-pipeline",
      "oneliner": "Training with 16-bit compute for speed while keeping 32-bit master weights for numerical stability — the standard for all modern LLM training.",
      "explanation": "Mixed-precision training runs the forward and backward passes in 16-bit for speed and memory savings, while maintaining a copy of the weights in 32-bit for accurate gradient accumulation. With bf16 on modern hardware it works out of the box; with fp16, loss scaling prevents gradient underflow. This gives nearly 2x speedup and halves activation memory. Every major LLM is trained this way, and it is the default in DeepSpeed, FSDP, and Accelerate.",
      "fundamentals": "Forward: compute in fp16/bf16. Backward: gradients in fp16/bf16. Optimizer step: accumulate gradients into fp32 master weights using Adam (momentum and variance stored in fp32). Memory per param: 2 bytes (fp16 weights) + 2 bytes (fp16 grads) + 4 bytes (fp32 master) + 8 bytes (Adam m + v in fp32) = 16 bytes total. Loss scaling (fp16 only): multiply loss by scale $S$ before backward, divide gradients by $S$ before optimizer step.",
      "related": [
        "bf16",
        "fp16",
        "fp32",
        "zero",
        "full-ft",
        "tool-accelerate"
      ],
      "seen_in": [
        "training-config",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Mixed Precision Training",
          "authors": "Micikevicius et al.",
          "venue": "ICLR 2018",
          "arxiv": "1710.03740"
        }
      ],
      "resources": [
        {
          "label": "Lilian Weng — How to Train Really Large Models",
          "url": "https://lilianweng.github.io/posts/2021-09-25-train-large/"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "mla": {
      "id": "mla",
      "name": "MLA",
      "expansion": "Multi-head Latent Attention",
      "category": "attention-variants",
      "oneliner": "DeepSeek's approach: compress K/V into a low-rank latent vector before caching — achieves MQA-level cache size with MHA-level quality.",
      "explanation": "Multi-head Latent Attention compresses key-value representations into a low-dimensional latent space instead of reducing head count like GQA or MQA. Only the compact latent vector is cached per token, and it gets up-projected back to full keys and values at attention time. This achieves over 90 percent KV cache reduction compared to standard multi-head attention while preserving quality. It was developed for and is used in the DeepSeek model family.",
      "fundamentals": "For DeepSeek-V2: d=5120, n_heads=128, head_dim=128, d_c=512 (compression dim), d_R=64 (RoPE dim). Compress: c_KV = x · W_DKV [5120→512]. Cache only c_KV (512 floats) + k_rope (64 floats) per token per layer = 576 $\\times$ 2 = 1,152 bytes. Decompress at attention time: K = c_KV · W_UK [512→128$\\times$128], V = c_KV · W_UV. Final key = [K_content; k_rope] (dim 192). Compare: MHA=65,536 bytes, GQA(8kv)=4,096 bytes, MQA=512 bytes, MLA=1,152 bytes. Tradeoff: adds compute for up-projection but saves bandwidth — net win for memory-bound decoding.",
      "seen_in": [
        "model-config",
        "research-papers"
      ],
      "related": [
        "gqa",
        "kv-cache",
        "mha",
        "mqa",
        "rope",
        "token"
      ],
      "sources": [
        "DeepSeek-AI, 'DeepSeek-V2,' 2024, arXiv:2405.04434"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model",
          "authors": "DeepSeek-AI",
          "venue": "2024",
          "arxiv": "2405.04434"
        }
      ],
      "resources": [
        {
          "label": "Sebastian Raschka — Visual Attention Variants",
          "url": "https://magazine.sebastianraschka.com/p/visual-attention-variants"
        }
      ]
    },
    "mmlu": {
      "id": "mmlu",
      "name": "MMLU",
      "expansion": "Massive Multitask Language Understanding",
      "category": "scaling-patterns",
      "oneliner": "A 57-subject multiple-choice benchmark spanning high school to professional level — the most commonly cited LLM knowledge evaluation.",
      "explanation": "MMLU is a benchmark that tests a language model's knowledge across 57 academic subjects from high school to professional level, using four-way multiple-choice questions. It became the standard way to report general LLM capability, appearing on virtually every model card. Frontier models now score above 88 percent, approaching saturation. Harder successors like MMLU-Pro are replacing it for frontier evaluation, but MMLU remains the most widely cited baseline.",
      "fundamentals": "MMLU uses few-shot evaluation (typically 5-shot): the model sees 5 example question-answer pairs as context, then must answer a new question by selecting A/B/C/D. Scoring is simple accuracy across all 57 subjects. The benchmark has 14,042 questions total. Subjects are grouped into STEM, humanities, social sciences, and other. Per-subject scores reveal capability patterns: a model might score 95% on history but 60% on abstract algebra.",
      "related": [
        "pre-training",
        "inference"
      ],
      "seen_in": [
        "model-cards",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Measuring Massive Multitask Language Understanding",
          "authors": "Hendrycks et al.",
          "venue": "ICLR 2021",
          "arxiv": "2009.03300"
        }
      ],
      "resources": [
        {
          "label": "MMLU on Papers With Code",
          "url": "https://paperswithcode.com/dataset/mmlu"
        },
        {
          "label": "Open LLM Leaderboard",
          "url": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "model-card": {
      "id": "model-card",
      "name": "Model Card",
      "expansion": "Model Card (Model Documentation)",
      "category": "model-naming",
      "oneliner": "A standardized documentation page accompanying every model release — covering architecture, training data, benchmarks, intended use, limitations, and license.",
      "explanation": "A model card is the README and metadata page that accompanies a model release on HuggingFace or other platforms. It documents what the model is, how it was trained, what data was used, benchmark scores, known limitations, intended use cases, and license terms. Google introduced model cards as a concept in 2019 to promote transparency and responsible AI. On HuggingFace, model cards are Markdown files rendered on the model page, often including YAML frontmatter with structured metadata.",
      "related": [
        "huggingface-hub",
        "tag-instruct",
        "evals"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Model Cards for Model Reporting",
          "authors": "Mitchell et al.",
          "venue": "FAT* 2019",
          "arxiv": "1810.03993"
        }
      ],
      "resources": [
        {
          "label": "HuggingFace — Model cards guide",
          "url": "https://huggingface.co/docs/hub/model-cards"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "model-head": {
      "id": "model-head",
      "name": "Model Head",
      "expansion": "Task-Specific Model Head",
      "category": "layer-types",
      "oneliner": "The output layer(s) stacked on top of a pretrained backbone that adapt it to a specific task like classification or text generation.",
      "explanation": "A model head is the task-specific projection layer on top of a shared backbone. In a causal language model, the LM head projects hidden states to vocabulary-sized logits for next-token prediction, often sharing weights with the input embedding table. For classification, a head pools the sequence and projects to class logits instead. Swapping heads lets you reuse the same pretrained backbone for entirely different tasks without retraining it.",
      "fundamentals": "For an LM head with weight tying, logits are $z = h W_E^\\top$ where $h \\in \\mathbb{R}^{d}$ is the hidden state and $W_E \\in \\mathbb{R}^{|V| \\times d}$ is the shared embedding matrix. Weight tying reduces parameters by $|V| \\times d$. A classification head applies $\\text{softmax}(W_c h + b_c)$ where $W_c \\in \\mathbb{R}^{C \\times d}$.",
      "related": [
        "backbone",
        "ffn",
        "token"
      ],
      "seen_in": [
        "code",
        "model-config"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "HuggingFace Model summary",
          "url": "https://huggingface.co/docs/transformers/model_summary"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "model-merging": {
      "id": "model-merging",
      "name": "Model Merging",
      "expansion": "Model Merging / Weight Merging / Model Fusion",
      "category": "scaling-patterns",
      "oneliner": "Combine weights from multiple fine-tuned models (same base) by arithmetic interpolation — no training needed. Popular methods: SLERP, TIES, DARE.",
      "explanation": "Model merging combines the weights of multiple fine-tuned models into a single model without additional training. It works because models fine-tuned from the same base occupy nearby regions in weight space, so interpolating between them can combine their capabilities. Simple averaging works surprisingly well, while methods like TIES and DARE handle sign conflicts and redundancy more carefully. It fails when models lack a shared base or have conflicting specializations.",
      "fundamentals": "Task vector: τ_A = W_A - W_base. Linear: W = W_base + $\\alpha$·τ_A + $\\beta$·τ_B. SLERP: interpolate along great circle on unit hypersphere, per-tensor. TIES: 1) Trim entries below magnitude percentile (e.g., 80th). 2) Elect sign by majority vote across models. 3) Average agreed entries. DARE: 1) Drop entries with probability p (e.g., 0.9). 2) Rescale survivors by 1/(1-p). 3) Merge. Why it works: 'linear mode connectivity' — same-base fine-tunes stay in same loss basin. Breaks with high LR, many epochs, or very different tasks.",
      "seen_in": [
        "model-cards",
        "repo-names"
      ],
      "related": [
        "lora"
      ],
      "sources": [
        "Wortsman et al., 'Model Soups,' ICML 2022",
        "Yadav et al., 'TIES-Merging,' NeurIPS 2023, arXiv:2306.01708",
        "Yu et al., 'DARE,' ICML 2024, arXiv:2311.03099"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Model Soups: Averaging Weights of Multiple Fine-tuned Models",
          "authors": "Wortsman et al.",
          "venue": "ICML 2022",
          "arxiv": "2203.05482"
        },
        {
          "title": "TIES-Merging: Resolving Interference When Merging Models",
          "authors": "Yadav et al.",
          "venue": "NeurIPS 2023",
          "arxiv": "2306.01708"
        },
        {
          "title": "Language Models are Super Mario: Absorbing Abilities from Homologous Models as a Free Lunch (DARE)",
          "authors": "Yu et al.",
          "venue": "ICML 2024",
          "arxiv": "2311.03099"
        }
      ]
    },
    "model-size-memory": {
      "id": "model-size-memory",
      "name": "Model Size ↔ Memory",
      "expansion": "Memory Footprint Calculation for Quantized LLMs",
      "category": "quantization-basics",
      "oneliner": "Memory (bytes) = params $\\times$ bits_per_param / 8, plus overhead for scales, KV-cache, and non-quantized components.",
      "explanation": "Model memory footprint is estimated by multiplying parameter count by bytes per parameter. A 70-billion parameter model in 16-bit needs about 140 GB, while the same model in 4-bit needs roughly 35 GB. Real usage is higher because some layers stay in 16-bit, scale factors add overhead, and the KV cache grows with sequence length and batch size. For a 70B model, KV cache alone can reach 2.5 GB per sequence at 4096 tokens.",
      "fundamentals": "Weight memory = P $\\times$ b / 8. Scale overhead = P $\\times$ (s_bits / g) / 8. KV-cache = 2 $\\times$ layers $\\times$ kv_heads $\\times$ head_dim $\\times$ seq_len $\\times$ batch $\\times$ kv_bytes. Practical: RTX 4090 (24 GB) fits 7B at fp16, 13B at int8, ~33B at int4. A100-80GB fits 70B at int8. H100 at 3.35 TB/s: 70B int4 $\\approx$ 55 tok/s, 70B fp16 $\\approx$ 14 tok/s.",
      "seen_in": [
        "documentation",
        "hardware-guides"
      ],
      "related": [
        "fp16",
        "int4",
        "int8",
        "kv-cache",
        "quantization"
      ],
      "sources": [
        "Pope et al., 'Efficiently Scaling Transformer Inference,' MLSys 2023, arXiv:2211.05102"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Efficiently Scaling Transformer Inference",
          "authors": "Pope et al.",
          "venue": "MLSys 2023",
          "arxiv": "2211.05102"
        }
      ]
    },
    "moe": {
      "id": "moe",
      "name": "MoE",
      "expansion": "Mixture-of-Experts",
      "category": "scaling-patterns",
      "oneliner": "Replace each FFN with N parallel expert FFNs, activate only top-k per token via learned routing. Decouples total params from per-token compute.",
      "explanation": "Mixture-of-Experts is an architecture that replaces each dense feed-forward layer with multiple parallel expert sub-networks and a learned router that selects only a few of them per token. This means a model can have hundreds of billions of total parameters while only using a small fraction for any given token, dramatically reducing compute cost. The tradeoff is that all parameters must still fit in memory. Mixtral 8x7B has 46.7B total but only 12.9B active per token.",
      "fundamentals": "Router: $g(x) = \\text{softmax}(W_g \\cdot x)$, select $\\text{TopK}(g(x), k)$. Output: $y = \\sum_{i \\in S} g(x)_i \\cdot \\text{Expert}_i(x)$. Load balance loss: $L = \\alpha \\cdot N \\cdot \\sum_i (f_i \\cdot p_i)$ where $f_i$=token fraction, $p_i$=mean prob. Capacity factor CF: buffer = $CF \\times (T/N)$ tokens per expert, excess dropped. Mixtral math: 32 layers $\\times$ 8 experts $\\times$ ~134M params/expert = 34.4B FFN + 12.3B attention = 46.7B total. Per token: 2/8 experts $\\to$ 12.9B active. Memory: 46.7B (all loaded). FLOPS: ~13B equivalent.",
      "seen_in": [
        "model-name",
        "model-config"
      ],
      "related": [
        "dense-models",
        "expert-routing",
        "ffn",
        "mxfp4-moe",
        "token"
      ],
      "sources": [
        "Shazeer et al., 'Outrageously Large Neural Networks,' ICLR 2017",
        "Fedus et al., 'Switch Transformers,' JMLR 2022",
        "Jiang et al., 'Mixtral of Experts,' arXiv:2401.04088",
        "DeepSeek-AI, 'DeepSeek-V3,' arXiv:2412.19437"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer",
          "authors": "Shazeer et al.",
          "venue": "ICLR 2017",
          "arxiv": "1701.06538"
        },
        {
          "title": "Switch Transformers: Scaling to Trillion Parameter Models",
          "authors": "Fedus et al.",
          "venue": "JMLR 2022",
          "arxiv": "2101.03961"
        },
        {
          "title": "Mixtral of Experts",
          "authors": "Jiang et al.",
          "venue": "2024",
          "arxiv": "2401.04088"
        }
      ],
      "resources": [
        {
          "label": "Cameron Wolfe — MoE LLMs",
          "url": "https://cameronrwolfe.substack.com/p/moe-llms"
        },
        {
          "label": "Lilian Weng — The Transformer Family v2",
          "url": "https://lilianweng.github.io/posts/2023-01-27-the-transformer-family-v2/"
        }
      ]
    },
    "mqa": {
      "id": "mqa",
      "name": "MQA",
      "expansion": "Multi-Query Attention",
      "category": "attention-variants",
      "oneliner": "All query heads share a single K and V head — reduces KV cache by n_heads$\\times$ (e.g., 32$\\times$) with small quality tradeoff.",
      "explanation": "Multi-Query Attention shares a single key head and value head across all query heads, dramatically shrinking the KV cache. During autoregressive decoding the bottleneck is reading cached keys and values from memory, not computation, so reducing cache size directly improves throughput. MQA shrinks this cache by a factor equal to the number of query heads, with only a small quality loss. It is used in PaLM, Falcon, and StarCoder, and inspired the GQA compromise.",
      "fundamentals": "For hidden_dim=4096, n_query_heads=32, n_kv_heads=1, head_dim=128: W_Q [4096, 4096], W_K [4096, 128] (32$\\times$ smaller), W_V [4096, 128] (32$\\times$ smaller), W_O [4096, 4096]. K and V broadcast to match query heads during attention. KV cache per token per layer: 2 $\\times$ 1 $\\times$ 128 $\\times$ 2 = 512 bytes (vs 16 KB in MHA). For 32 layers at 4096 tokens: ~64 MB (vs ~2 GB). Bandwidth per decode step at seq_len=2048: ~32 MB (vs ~1 GB in MHA).",
      "seen_in": [
        "model-config"
      ],
      "related": [
        "gqa",
        "kv-cache",
        "mha",
        "token"
      ],
      "sources": [
        "Shazeer, 'Fast Transformer Decoding: One Write-Head is All You Need,' 2019, arXiv:1911.02150"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Fast Transformer Decoding: One Write-Head is All You Need",
          "authors": "Shazeer",
          "venue": "2019",
          "arxiv": "1911.02150"
        }
      ],
      "resources": [
        {
          "label": "Lilian Weng — The Transformer Family v2",
          "url": "https://lilianweng.github.io/posts/2023-01-27-the-transformer-family-v2/"
        }
      ]
    },
    "mt-bench": {
      "id": "mt-bench",
      "name": "MT-Bench",
      "expansion": "MT-Bench — Multi-Turn Benchmark",
      "category": "scaling-patterns",
      "oneliner": "An 80-question multi-turn conversation benchmark scored by GPT-4 across 8 categories — one of the first LLM-as-judge evaluations.",
      "explanation": "MT-Bench evaluates chat model quality through 80 multi-turn questions spanning writing, roleplay, reasoning, math, coding, extraction, STEM, and humanities. A strong judge model (GPT-4) scores each response on a 1-10 scale. The multi-turn format tests whether models can maintain coherence and follow up on previous context. MT-Bench was introduced alongside Chatbot Arena as a complementary automated evaluation and is widely used for quick model comparisons during development.",
      "related": [
        "evals",
        "llm-as-judge",
        "chatbot-arena"
      ],
      "seen_in": [
        "model-cards",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
          "authors": "Zheng et al.",
          "venue": "NeurIPS 2023",
          "arxiv": "2306.05685"
        }
      ],
      "resources": [
        {
          "label": "MT-Bench paper",
          "url": "https://arxiv.org/abs/2306.05685"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "mxfp4-moe": {
      "id": "mxfp4-moe",
      "name": "MXFP4_MOE",
      "expansion": "Microscaling FP4 optimized for Mixture-of-Experts layers",
      "category": "formats",
      "oneliner": "E2M1 micro-float with block scaling, specifically targeting MoE expert weights. Only appears in 26B-A4B GGUF repos.",
      "explanation": "MXFP4_MOE is a quantization format that applies the OCP Microscaling FP4 specification to the expert weight matrices inside Mixture-of-Experts models. It uses tiny E2M1 floating-point elements that share a block-level E8M0 scale factor, keeping storage extremely compact. MoE expert weights are a natural target because there are many of them but each is individually small. This is a relatively new format supported in llama.cpp.",
      "fundamentals": "Same as E2M1/MXFP4: 32 elements share one 8-bit scale. Per-element: 1 sign, 2 exp, 1 mantissa = 15 distinct values. Effective 4.25 bpw. The _MOE suffix is an architecture annotation, not a different quantization algorithm.",
      "seen_in": [
        "gguf-filename"
      ],
      "related": [
        "e2m1",
        "gguf",
        "moe",
        "quantization",
        "tool-llamacpp"
      ],
      "sources": [
        "OCP MX Spec v1.0",
        "llama.cpp source code"
      ],
      "confidence": "medium",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "OCP Microscaling (MX) Spec v1.0",
          "url": "https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf"
        }
      ]
    },
    "nf4": {
      "id": "nf4",
      "name": "nf4 / NormalFloat4",
      "expansion": "4-bit NormalFloat — normal-distribution-optimal 4-bit data type",
      "category": "precision-formats",
      "oneliner": "Non-uniform 4-bit format with levels spaced at quantiles of a Gaussian — optimal for neural network weights, used by QLoRA/bitsandbytes.",
      "explanation": "NormalFloat4 is a non-uniform 4-bit data type designed for neural network weights. Since trained weights approximately follow a Gaussian distribution, uniform quantization wastes resolution in sparse tails and under-resolves the dense center. NF4 places its 16 levels at the quantiles of a standard normal distribution so each level covers equal probability mass, minimizing quantization error. It is the default format for bitsandbytes 4-bit loading.",
      "fundamentals": "Lookup table: $\\{-1.0, -0.6962, -0.5251, -0.3949, -0.2844, -0.1848, -0.0911, 0.0, 0.0796, 0.1609, 0.2461, 0.3379, 0.4407, 0.5626, 0.7230, 1.0\\}$. Levels are denser near zero (spacing 0.08) and sparser in tails (spacing 0.28). Quantize: normalize per group $w_{\\text{norm}} = w/\\text{absmax}$, map to nearest level. Dequantize: $\\text{nf4\\_lookup}[\\text{index}] \\times \\text{absmax}$.",
      "seen_in": [
        "model-config",
        "bitsandbytes-config"
      ],
      "related": [
        "bitsandbytes",
        "int4",
        "qlora",
        "quantization"
      ],
      "sources": [
        "Dettmers et al., 'QLoRA,' NeurIPS 2023, arXiv:2305.14314"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "QLoRA: Efficient Finetuning of Quantized Language Models",
          "authors": "Dettmers et al.",
          "venue": "NeurIPS 2023",
          "arxiv": "2305.14314"
        }
      ]
    },
    "ntk-rope": {
      "id": "ntk-rope",
      "name": "NTK-aware RoPE Scaling",
      "expansion": "Neural Tangent Kernel-aware Rotary Position Embedding Scaling",
      "category": "position-encodings",
      "oneliner": "Extends RoPE context by increasing base frequency — preserves high-frequency local resolution while stretching low-frequency global capacity. Works zero-shot.",
      "explanation": "NTK-aware RoPE scaling extends a model's context window by increasing the base frequency in a way that treats different frequency bands differently. Unlike position interpolation, which compresses all frequencies equally and damages local resolution, this approach preserves high-frequency dimensions encoding local positions while stretching low-frequency ones for global positions. It requires no fine-tuning and became the basis for more advanced methods like YaRN.",
      "fundamentals": "New base: base' = base · $\\alpha$^(d/(d-2)) where $\\alpha$ = context extension factor. For $\\alpha$=4, d=128: base' $\\approx$ 40,440. Effect on wavelengths: highest-freq band (i=0) unchanged ($\\lambda_0$=2$\\pi$ always), lowest-freq band scales by ~$\\alpha$. Intermediate bands interpolate smoothly. Compare PI: all frequencies compressed uniformly by $\\alpha$ — damages high-freq resolution. NTK preserves local while extending global.",
      "seen_in": [
        "model-config"
      ],
      "related": [
        "rope",
        "yarn",
        "position-interpolation",
        "abf"
      ],
      "sources": [
        "/u/bloc97, 'NTK-Aware Scaled RoPE,' Reddit r/LocalLLaMA, June 2023",
        "Formalized in Peng et al., 'YaRN,' arXiv:2309.00071"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Original Reddit post by /u/bloc97 (NTK-aware scaled RoPE)",
          "url": "https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/"
        },
        {
          "label": "Formalized in YaRN paper",
          "url": "https://arxiv.org/abs/2309.00071"
        }
      ]
    },
    "observability": {
      "id": "observability",
      "name": "LLM Observability",
      "expansion": "LLM Observability (Tracing and Monitoring)",
      "category": "scaling-patterns",
      "oneliner": "Tracing every LLM call in production — recording prompts, completions, latency, token counts, and costs — to debug, optimize, and monitor quality.",
      "explanation": "LLM observability is the practice of instrumenting every LLM interaction in a production system so you can see exactly what went in, what came out, how long it took, and how much it cost. Traces capture the full chain: user input, system prompt, retrieved context, model response, and any tool calls. This is essential for debugging agent failures, detecting quality regressions, optimizing prompts, and tracking spend.",
      "related": [
        "llmops",
        "inference-metrics",
        "evals"
      ],
      "seen_in": [
        "documentation",
        "code"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "LangFuse docs",
          "url": "https://langfuse.com/docs"
        },
        {
          "label": "Arize Phoenix",
          "url": "https://github.com/Arize-ai/phoenix"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "olmo": {
      "id": "olmo",
      "name": "OLMo",
      "expansion": "OLMo — Open Language Model (AI2)",
      "category": "hf-organizations",
      "oneliner": "AI2's fully open foundation model that publishes not just weights but the complete training data, code, intermediate checkpoints, and evaluation suite.",
      "explanation": "OLMo is a family of language models from the Allen Institute for AI (AI2) designed to be the most open foundation model project. Unlike most open-weight models that release only final weights, OLMo publishes the complete training code, the full training dataset (Dolma), all intermediate checkpoints, training logs, and evaluation results. This radical openness enables the research community to study training dynamics, reproduce results, and build on the work.",
      "fundamentals": "Dolma dataset: ~3 trillion tokens from Common Crawl, Wikipedia, books, code, academic papers. Architecture: standard decoder-only transformer with SwiGLU, RoPE, GQA, RMSNorm. Training: AdamW, cosine LR schedule, context length 4096. Released: full training code (PyTorch + Composer), Dolma dataset, ~500 intermediate checkpoints per run, Weights & Biases training logs, evaluation framework (Catwalk). License: Apache 2.0 for code, ODC-By for data.",
      "related": [
        "pre-training",
        "training-recipe",
        "org-meta"
      ],
      "seen_in": [
        "model-cards",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "OLMo: Accelerating the Science of Language Models",
          "authors": "Groeneveld et al.",
          "venue": "ACL 2024",
          "arxiv": "2402.00838"
        }
      ],
      "resources": [
        {
          "label": "OLMo GitHub",
          "url": "https://github.com/allenai/OLMo"
        },
        {
          "label": "AI2 OLMo page",
          "url": "https://allenai.org/olmo"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "open-weights": {
      "id": "open-weights",
      "name": "Open Weights",
      "expansion": "Open Weights vs Fully Open Models",
      "category": "model-naming",
      "oneliner": "Models where the trained weights are publicly downloadable — distinct from fully open models that also release training data, code, and intermediate checkpoints.",
      "explanation": "Open weights means the model's trained parameters are publicly downloadable and usable, but the training data, code, and process may remain proprietary. Llama, Mistral, and Qwen are open-weight models. Fully open models like OLMo additionally release the complete training data, training code, intermediate checkpoints, and evaluation suite.",
      "related": [
        "olmo",
        "org-meta",
        "org-mistral",
        "license-types"
      ],
      "seen_in": [
        "model-cards",
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "org-bartowski": {
      "id": "org-bartowski",
      "name": "bartowski/",
      "expansion": "Currently most active broad quantizer",
      "category": "hf-organizations",
      "oneliner": "Produces GGUF (and sometimes EXL2/AWQ) quants within hours of new model releases. Consistent naming, comprehensive quant levels. Inherited TheBloke's role.",
      "seen_in": [
        "repo-name"
      ],
      "related": [
        "gguf",
        "imatrix",
        "org-thebloke"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "bartowski HuggingFace profile",
          "url": "https://huggingface.co/bartowski"
        }
      ],
      "explanation": "bartowski is a HuggingFace contributor who produces GGUF quantizations of new models typically within hours of their release. Each repo includes a comprehensive range of quant levels from Q2 to Q8, with consistent naming conventions and imatrix-calibrated variants for better quality at low bit widths. This speed and consistency have made bartowski the go-to source for GGUF files in the current open-weight ecosystem."
    },
    "org-deepseek": {
      "id": "org-deepseek",
      "name": "deepseek-ai/",
      "expansion": "DeepSeek — V2/V3, R1, Coder",
      "category": "hf-organizations",
      "oneliner": "Pioneered MLA (KV-cache compression) and DeepSeekMoE. DeepSeek-R1 brought open chain-of-thought reasoning rivaling o1. Chinese quant fund backed.",
      "seen_in": [
        "repo-name"
      ],
      "related": [
        "mla"
      ],
      "foundational_papers": [
        {
          "title": "DeepSeek-V2: A Strong, Economical, and Efficient MoE LM",
          "authors": "DeepSeek-AI",
          "venue": "2024",
          "arxiv": "2405.04434"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "explanation": "DeepSeek is a Chinese AI lab that has pushed open-weight model architecture forward with innovations like Multi-head Latent Attention, which compresses KV caches to reduce memory use, and DeepSeekMoE, an efficient mixture-of-experts design. Their DeepSeek-R1 model brought open chain-of-thought reasoning to the community, showing step-by-step problem solving previously seen only in closed models like OpenAI's o1."
    },
    "org-google": {
      "id": "org-google",
      "name": "google/",
      "expansion": "Google DeepMind — Gemma, T5",
      "category": "hf-organizations",
      "oneliner": "Gemma 1/2/3/4, T5, FLAN-T5, PaLiGemma. Open-weight models from Gemini research. Strong performance relative to size.",
      "seen_in": [
        "repo-name"
      ],
      "related": [],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Google HuggingFace profile",
          "url": "https://huggingface.co/google"
        },
        {
          "label": "Gemma Technical Report",
          "url": "https://arxiv.org/abs/2403.08295"
        }
      ],
      "explanation": "Google has released several open-weight model families derived from its Gemini research, including the Gemma series, T5, FLAN-T5, and the multimodal PaLiGemma. Gemma models are known for strong performance relative to their size and come with permissive licenses suitable for commercial use. Google also contributes foundational research like the Transformer architecture itself and training techniques widely adopted across the field."
    },
    "org-meta": {
      "id": "org-meta",
      "name": "meta-llama/",
      "expansion": "Meta AI — Llama model family",
      "category": "hf-organizations",
      "oneliner": "Llama 1/2/3/4. The backbone of the open-weight ecosystem. Most fine-tuned, quantized, and experimented-with model family.",
      "seen_in": [
        "repo-name"
      ],
      "related": [],
      "foundational_papers": [
        {
          "title": "LLaMA: Open and Efficient Foundation Language Models",
          "authors": "Touvron et al.",
          "venue": "2023",
          "arxiv": "2302.13971"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "explanation": "Meta AI is the research division behind the Llama family of open-weight language models, spanning Llama 1 through Llama 4. By releasing powerful models with permissive licenses, Meta catalyzed the entire open-weight ecosystem — most fine-tunes, quantizations, and serving tools were first built around Llama. The Llama 3 series matched or exceeded many closed models at equivalent sizes, making it the most widely fine-tuned base model family."
    },
    "org-microsoft": {
      "id": "org-microsoft",
      "name": "microsoft/",
      "expansion": "Microsoft — Phi models",
      "category": "hf-organizations",
      "oneliner": "Phi-1/2/3/4. Demonstrated 'textbook-quality' training data produces small models that punch far above size. MIT licensed.",
      "seen_in": [
        "repo-name"
      ],
      "related": [],
      "foundational_papers": [
        {
          "title": "Textbooks Are All You Need",
          "authors": "Gunasekar et al.",
          "venue": "2023",
          "arxiv": "2306.11644"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "explanation": "Microsoft Research produces the Phi series of small language models which demonstrate that carefully curated, textbook-quality training data can make compact models perform far above their size class. Phi-3 Mini at 3.8B parameters rivals models several times larger on many benchmarks. The models are MIT-licensed, making them attractive for on-device and embedded deployments where size and licensing flexibility both matter."
    },
    "org-mistral": {
      "id": "org-mistral",
      "name": "mistralai/",
      "expansion": "Mistral AI — Mistral, Mixtral",
      "category": "hf-organizations",
      "oneliner": "Mistral 7B, Mixtral 8x7B/8x22B, Codestral. Pioneered SWA and open MoE. Apache 2.0 base models. Paris-based startup.",
      "seen_in": [
        "repo-name"
      ],
      "related": [],
      "foundational_papers": [
        {
          "title": "Mistral 7B",
          "authors": "Jiang et al.",
          "venue": "2023",
          "arxiv": "2310.06825"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "explanation": "Mistral AI is a Paris-based startup that released Mistral 7B, which outperformed Llama 2 13B despite being half the size, putting the company on the map overnight. They pioneered sliding window attention for efficient long contexts and released Mixtral 8x7B and 8x22B, bringing sparse mixture-of-experts architectures to the open-weight world. They also produce Codestral for code generation and offer commercial API endpoints alongside open releases."
    },
    "org-mlx-community": {
      "id": "org-mlx-community",
      "name": "mlx-community/",
      "expansion": "Apple MLX format models",
      "category": "hf-organizations",
      "oneliner": "Community org publishing models in Apple MLX format for Apple Silicon Macs (M1-M4). Typically 4-bit or 8-bit MLX quants.",
      "seen_in": [
        "repo-name"
      ],
      "related": [
        "inference"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "mlx-community HuggingFace profile",
          "url": "https://huggingface.co/mlx-community"
        },
        {
          "label": "Apple MLX GitHub",
          "url": "https://github.com/ml-explore/mlx"
        }
      ],
      "explanation": "The MLX Community is a collaborative HuggingFace organization that publishes models converted to Apple's MLX format, optimized for Apple Silicon chips like M1 through M4. MLX leverages the unified memory architecture of these chips, allowing models to use both CPU and GPU memory seamlessly. The community ensures that popular new models are quickly available in MLX format, making it the main hub for Mac-native local inference."
    },
    "org-mradermacher": {
      "id": "org-mradermacher",
      "name": "mradermacher/",
      "expansion": "High-volume automated GGUF quantizer",
      "category": "hf-organizations",
      "oneliner": "Quantizes a massive breadth of models including obscure fine-tunes. GGUF specialist. Useful when bartowski hasn't covered a model.",
      "seen_in": [
        "repo-name"
      ],
      "related": [
        "gguf",
        "org-bartowski",
        "quantization"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "mradermacher HuggingFace profile",
          "url": "https://huggingface.co/mradermacher"
        }
      ],
      "explanation": "mradermacher is a HuggingFace account that runs high-volume automated quantization pipelines, producing GGUF versions of a vast number of models including obscure fine-tunes and niche variants that other quantizers skip. While bartowski focuses on prominent releases with careful calibration, mradermacher prioritizes breadth and coverage. This makes the account a valuable last-resort source when searching for quantized versions of lesser-known models."
    },
    "org-nousresearch": {
      "id": "org-nousresearch",
      "name": "NousResearch/",
      "expansion": "Community collective — Hermes, Capybara",
      "category": "hf-organizations",
      "oneliner": "Hermes fine-tune series (strong instruction-following + function calling). One of the most respected community fine-tune recipes. ChatML format.",
      "seen_in": [
        "repo-name"
      ],
      "related": [
        "dataset-hermes",
        "model-merging"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "NousResearch HuggingFace profile",
          "url": "https://huggingface.co/NousResearch"
        }
      ],
      "explanation": "Nous Research is a community-driven lab known for the Hermes fine-tune series, which adds strong instruction-following and function-calling capabilities to popular base models like Llama and Mistral. Hermes models use the ChatML prompt format and are widely adopted for agent and tool-use applications. Nous also runs research into synthetic data generation and model merging techniques that benefit the broader open-weight community."
    },
    "org-qwen": {
      "id": "org-qwen",
      "name": "Qwen/",
      "expansion": "Alibaba Cloud — Qwen (Tongyi Qianwen)",
      "category": "hf-organizations",
      "oneliner": "Qwen 1.5/2/2.5 + QwQ reasoning model. Strong multilingual (CJK), competitive at every size class. Apache 2.0. Among strongest open-weight families.",
      "seen_in": [
        "repo-name"
      ],
      "related": [],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Qwen HuggingFace profile",
          "url": "https://huggingface.co/Qwen"
        },
        {
          "label": "Qwen2 Technical Report",
          "url": "https://arxiv.org/abs/2407.10671"
        }
      ],
      "explanation": "Qwen is Alibaba Cloud's open-weight model family, spanning the Qwen 1.5, 2, and 2.5 series plus the QwQ reasoning model. The models are released under Apache 2.0 and offer strong multilingual performance, especially in Chinese and English. Qwen 2.5 is competitive with leading models at every parameter count from 0.5B to 72B. The family also includes vision, code, and math specialist variants."
    },
    "org-thebloke": {
      "id": "org-thebloke",
      "name": "TheBloke/",
      "expansion": "Tom Jobbins — legacy quantizer",
      "category": "hf-organizations",
      "oneliner": "The original prolific HF quantizer (GPTQ, GGUF). Quantized hundreds of models mid-2023 through early 2024. Now largely inactive; replaced by bartowski.",
      "seen_in": [
        "repo-name"
      ],
      "related": [
        "gguf",
        "gptq",
        "org-bartowski",
        "quantization"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "TheBloke HuggingFace profile",
          "url": "https://huggingface.co/TheBloke"
        }
      ],
      "explanation": "TheBloke was the most prolific quantizer on HuggingFace during 2023, converting hundreds of models into GPTQ and GGUF formats so the community could run them on consumer hardware. At a time when most model releases were full-precision only, TheBloke's repos were often the first place users went to grab a runnable version. The account is now largely inactive, with the quantization mantle passed to bartowski and others."
    },
    "org-turboderp": {
      "id": "org-turboderp",
      "name": "turboderp/",
      "expansion": "ExLlamaV2 creator, EXL2 quantizer",
      "category": "hf-organizations",
      "oneliner": "Developer of ExLlamaV2 inference engine. Publishes EXL2-format quants with variable per-layer bitwidth. Best quality-per-bit on NVIDIA GPUs.",
      "seen_in": [
        "repo-name"
      ],
      "related": [
        "exl2",
        "inference",
        "inference-engine",
        "quantization",
        "tool-exllamav2"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "ExLlamaV2 GitHub",
          "url": "https://github.com/turboderp/exllamav2"
        }
      ],
      "explanation": "turboderp is the solo developer behind the ExLlamaV2 inference engine and the EXL2 quantization format. They maintain the engine, publish EXL2-format quantizations of popular models, and actively benchmark performance on consumer NVIDIA GPUs. Their work is particularly valued by users running 70B-class models on single GPUs like the RTX 4090, where the variable-bitwidth EXL2 format maximizes quality within tight VRAM budgets."
    },
    "orpo": {
      "id": "orpo",
      "name": "ORPO",
      "expansion": "Odds Ratio Preference Optimization",
      "category": "training-pipeline",
      "oneliner": "Combines SFT + preference alignment in one stage via odds-ratio penalty — no separate SFT, no reference model. Single-stage, single-model.",
      "explanation": "ORPO is a single-stage alignment training method that teaches a model to prefer good responses over bad ones without needing a separate supervised fine-tuning step first. It adds an odds-ratio penalty directly to the standard language modeling loss, so the model learns both instruction-following and preference alignment at the same time. This avoids the waste of first raising the probability of all responses then separating them. It costs roughly half as much as SFT plus DPO.",
      "fundamentals": "L = L_SFT + $\\lambda$·L_OR. L_OR = -log $\\sigma$(log(odds(y_w|x)/odds(y_l|x))). odds(y|x) = p(y|x)/(1-p(y|x)). $\\lambda$ 0.1-1.0. No reference model.",
      "seen_in": [
        "model-cards"
      ],
      "related": [
        "dpo",
        "sft",
        "simpo"
      ],
      "foundational_papers": [
        {
          "title": "ORPO: Monolithic Preference Optimization without Reference Model",
          "authors": "Hong et al.",
          "venue": "EMNLP 2024",
          "arxiv": "2403.07691"
        }
      ],
      "sources": [
        "Hong et al., arXiv:2403.07691"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "HuggingFace TRL docs",
          "url": "https://huggingface.co/docs/trl/en/index"
        }
      ]
    },
    "paged-attention": {
      "id": "paged-attention",
      "name": "PagedAttention",
      "expansion": "Paged Attention (vLLM)",
      "category": "attention-variants",
      "oneliner": "KV cache memory management borrowing OS virtual memory paging — eliminates fragmentation, achieves <4% memory waste, enables 2-4$\\times$ higher serving throughput.",
      "explanation": "PagedAttention manages the KV cache like virtual memory, dividing it into fixed-size blocks mapped through a page table. Naive KV cache allocation reserves maximum sequence length per request, wasting 60-80 percent of memory. PagedAttention allocates blocks on demand, frees them on completion, and does not require contiguous memory. This reduces waste to under 4 percent, fitting far more concurrent requests, and enables copy-on-write for shared prefixes like system prompts.",
      "fundamentals": "Example: Llama 7B, 512 KB/token cache, max_seq_len=2048, 24 GB free. Naive: 1 GB/request, 24 concurrent. PagedAttention (block=16 tokens, 8 MB/block): 3,072 blocks total. Request using 512 tokens = 32 blocks = 256 MB → 96 concurrent (4$\\times$ more). Page table: request_i logical_block_j → physical_block_k. Attention kernel gathers K/V via page table indirection. Copy-on-write: shared prefix blocks referenced by multiple requests; copied only when diverging. Waste: <4% (only last partially-filled block per request).",
      "seen_in": [
        "serving-config",
        "documentation"
      ],
      "related": [
        "flash-attention",
        "kv-cache",
        "token",
        "tool-vllm"
      ],
      "sources": [
        "Kwon et al., 'Efficient Memory Management for LLM Serving with PagedAttention,' SOSP 2023, arXiv:2309.06180"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention",
          "authors": "Kwon et al.",
          "venue": "SOSP 2023",
          "arxiv": "2309.06180"
        }
      ]
    },
    "peft": {
      "id": "peft",
      "name": "PEFT",
      "expansion": "Parameter-Efficient Fine-Tuning (concept + HuggingFace library)",
      "category": "fine-tuning-methods",
      "oneliner": "Umbrella term for all methods adapting LLMs by training a tiny fraction of params — AND the HuggingFace library (LoRA, QLoRA, DoRA, IA3, prefix tuning, etc.).",
      "explanation": "PEFT is a family of techniques for fine-tuning large models by updating only a small fraction of their parameters. Methods in this family include LoRA, bottleneck adapters, prompt tuning, and selective approaches like BitFit. The HuggingFace PEFT library wraps these methods into a standard interface where adapter weights are saved as small files rather than full model copies. It supports merging, multi-adapter inference, and quantized base models.",
      "fundamentals": "API: LoraConfig(r=16, lora_alpha=32, target_modules=[...]). Save: adapter_model.safetensors + adapter_config.json. Load: PeftModel.from_pretrained(). Merge: model.merge_and_unload(). Methods: LoRA, QLoRA, DoRA, AdaLoRA, LoRA+, rsLoRA, IA3, prefix tuning, prompt tuning, OFT, BOFT.",
      "seen_in": [
        "model-config",
        "filename",
        "code"
      ],
      "related": [
        "adapters",
        "dora",
        "ia3",
        "inference",
        "lora",
        "qlora",
        "safetensors"
      ],
      "foundational_papers": [
        {
          "title": "Scaling Down to Scale Up: A Guide to Parameter-Efficient Fine-Tuning",
          "authors": "Lialin et al.",
          "venue": "2023",
          "arxiv": "2303.15647"
        },
        {
          "title": "Parameter-Efficient Transfer Learning for NLP",
          "authors": "Houlsby et al.",
          "venue": "ICML 2019",
          "arxiv": "1902.00751"
        },
        {
          "title": "LoRA: Low-Rank Adaptation of Large Language Models",
          "authors": "Hu et al.",
          "venue": "ICLR 2022",
          "arxiv": "2106.09685"
        }
      ],
      "sources": [
        "github.com/huggingface/peft"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10"
    },
    "per-group-quantization": {
      "id": "per-group-quantization",
      "name": "Per-Group Quantization",
      "expansion": "Group-wise Quantization Granularity",
      "category": "quantization-basics",
      "oneliner": "Divides weights into small groups (32-128 values) with separate scale factors per group — essential for making int4 viable.",
      "explanation": "Per-group quantization is a technique that divides a weight matrix into small groups and assigns each group its own scale factor for more accurate compression. With only 16 levels available in 4-bit format, a single scale factor for an entire matrix cannot capture local variations, so grouping is essential. Typical group sizes are 32 to 128 values, adding roughly 0.125 extra bits per weight in overhead. Finer groups give better quality but increase storage slightly.",
      "fundamentals": "Error depends on range: scale = range/(2^b - 1), max error = scale/2. Smaller groups have smaller local ranges → smaller scales → smaller errors. Overhead for g=128 with fp16 scales: 16/128 = 0.125 bits/weight. For g=32: 16/32 = 0.5 bits/weight.",
      "seen_in": [
        "quantization-config",
        "documentation"
      ],
      "related": [
        "awq",
        "calibration-data",
        "fp16",
        "gptq",
        "int4",
        "quantization"
      ],
      "sources": [
        "Nagel et al., arXiv:2106.08295",
        "Frantar et al., 'GPTQ,' arXiv:2210.17323"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "White Paper on Neural Network Quantization",
          "url": "https://arxiv.org/abs/2106.08295"
        }
      ]
    },
    "perplexity": {
      "id": "perplexity",
      "name": "Perplexity",
      "expansion": "Perplexity (Language Model Evaluation Metric)",
      "category": "scaling-patterns",
      "oneliner": "The standard metric for language model quality — the exponential of average cross-entropy loss. Lower is better.",
      "explanation": "Perplexity measures how well a language model predicts held-out text. It represents how many tokens the model is effectively choosing between at each step — a perplexity of 10 means the model is as uncertain as picking uniformly from 10 options. Lower is better. It appears on virtually every model card and quantization benchmark, where a 0.1-point increase indicates minimal quality loss and a 1.0-point increase indicates noticeable degradation.",
      "fundamentals": "$\\text{PPL} = \\exp\\left(-\\frac{1}{T}\\sum_{t=1}^{T} \\log p(x_t | x_{<t})\\right)$. For a perfect model predicting every token with probability 1: PPL = 1. Random prediction over vocabulary $V$: PPL = $|V|$. Typical LLM perplexity on WikiText-2: fp16 baseline ~5.5, Q4_K_M ~5.6 (+0.1), Q2_K ~6.5 (+1.0). Bits-per-character (BPC) is the log2 of perplexity divided by characters per token.",
      "related": [
        "pre-training",
        "quantization",
        "mmlu"
      ],
      "seen_in": [
        "model-cards",
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "HuggingFace — Perplexity of fixed-length models",
          "url": "https://huggingface.co/docs/transformers/perplexity"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "ple": {
      "id": "ple",
      "name": "PLE",
      "expansion": "Per-Layer Embeddings",
      "category": "layer-types",
      "oneliner": "Gemma 3/4 technique: each decoder layer has its own token embedding table, adding a per-layer per-token signal to the residual stream. Inflates total param count above 'effective' count.",
      "explanation": "Per-Layer Embeddings give each transformer layer its own token embedding lookup table instead of injecting token identity only at the input. In deep models the original token signal dilutes through many layers of transformation, so PLE provides a fresh identity signal at every layer. This explains why some models report a large gap between total and effective parameter counts. It is a relatively unique feature of the Gemma model family.",
      "fundamentals": "Standard: $h_0$ = Embed(token) + PE. PLE: each layer l adds PLE_l(token_id) to residual stream. PLE_l is a lookup table [V, d_ple]. If d_ple small (e.g., 128): overhead per layer = V$\\times$128. For V=256K, L=46 layers: 46$\\times$256K$\\times$128 $\\approx$ 1.5B params — substantial but not computed per-token in the same way as attention/FFN. Solves: token identity dilution in deep models, rare token handling, per-layer specialization based on token type.",
      "seen_in": [
        "model-config",
        "model-cards"
      ],
      "related": [
        "ffn",
        "residual-connection",
        "token"
      ],
      "sources": [
        "Google DeepMind, 'Gemma 3 Technical Report,' 2025, arXiv:2503.19786"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Gemma 3 Technical Report",
          "authors": "Google DeepMind",
          "venue": "2025",
          "arxiv": "2503.19786"
        }
      ]
    },
    "position-interpolation": {
      "id": "position-interpolation",
      "name": "Position Interpolation (PI)",
      "expansion": "Position Interpolation",
      "category": "position-encodings",
      "oneliner": "Extends context by linearly downscaling position indices so longer sequences map into the original training range — simple but damages high-frequency resolution.",
      "explanation": "Position Interpolation extends a model's context window by dividing all position indices by a scaling factor so they fit within the original training range. Extending from 2K to 32K context means dividing positions by 16, requiring only about 1000 fine-tuning steps to adjust. It was demonstrated by Meta on Llama with 16x context extension. It was later superseded by NTK-aware scaling and YaRN, which avoid compressing all frequency bands equally.",
      "fundamentals": "Modified angle: (m/s)·$\\theta_i$ instead of m·$\\theta_i$. Effective position m'=m/s. For L'=s·L: positions [0, L'-1] → effective [0, L-1/s] ⊂ [0, L). All within training range. Fine-tuning needed: adjacent tokens now separated by 1/s in position space (was 1). Comparison: PI uniformly compresses all frequencies by s. NTK preserves high-freq, stretches low-freq. YaRN: per-band treatment. In HuggingFace: called 'linear' scaling, not 'pi'.",
      "seen_in": [
        "model-config"
      ],
      "related": [
        "rope",
        "ntk-rope",
        "yarn",
        "abf"
      ],
      "sources": [
        "Chen et al., 'Extending Context Window of Large Language Models via Positional Interpolation,' 2023, arXiv:2306.15595"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Extending Context Window of Large Language Models via Positional Interpolation",
          "authors": "Chen et al.",
          "venue": "2023",
          "arxiv": "2306.15595"
        }
      ]
    },
    "post-training": {
      "id": "post-training",
      "name": "Post-Training",
      "expansion": "Post-Training (the full alignment and safety pipeline after pre-training)",
      "category": "training-pipeline",
      "oneliner": "Everything that happens to a model after pre-training — SFT, preference alignment, safety training, and capability fine-tuning — turning a base model into a deployable assistant.",
      "explanation": "Post-training is the umbrella term for all training stages that follow pre-training, transforming a raw base model into a safe, helpful assistant. A typical post-training recipe includes supervised fine-tuning on instruction data, preference alignment via DPO or RLHF, safety training with red-team data, and capability-specific tuning for code, math, or tool use.",
      "fundamentals": "Typical pipeline: SFT (instruction following, 10K-1M examples) → preference alignment (DPO/RLHF, 50K-500K pairs) → safety training (red-team data, refusal training) → capability tuning (code, math, tool use). Some labs iterate: Llama 2 used multiple rounds of RLHF with rejection sampling. Data quality matters more than quantity at this stage (LIMA principle). Post-training can degrade pre-training capabilities (alignment tax), so careful evaluation between stages is essential.",
      "related": [
        "sft",
        "dpo",
        "rlhf",
        "pre-training",
        "red-teaming"
      ],
      "seen_in": [
        "model-cards",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Training language models to follow instructions with human feedback (InstructGPT)",
          "authors": "Ouyang et al.",
          "venue": "NeurIPS 2022",
          "arxiv": "2203.02155"
        }
      ],
      "resources": [
        {
          "label": "Llama 2 post-training details",
          "url": "https://arxiv.org/abs/2307.09288"
        },
        {
          "label": "Sebastian Raschka — Understanding Reasoning LLMs",
          "url": "https://magazine.sebastianraschka.com/p/understanding-reasoning-llms"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "pre-norm": {
      "id": "pre-norm",
      "name": "Pre-Norm vs Post-Norm",
      "expansion": "Pre-Layer Normalization vs Post-Layer Normalization",
      "category": "layer-types",
      "oneliner": "Where normalization sits: before the sub-layer (Pre-Norm, modern standard — stable training) or after the residual add (Post-Norm, original Transformer — fragile at depth).",
      "explanation": "Pre-norm places layer normalization before each sublayer rather than after it, so the residual path carries un-normalized activations directly through skip connections. This gives gradients an unimpeded path regardless of network depth, making training far more stable. The original Transformer used post-norm, but GPT-2 switched to pre-norm and virtually all modern LLMs have followed. The tradeoff is slightly worse converged loss compared to well-tuned post-norm.",
      "fundamentals": "Post-Norm: output = Norm(x + Sublayer(x)). Gradient path goes through Norm Jacobian at every layer. Pre-Norm: output = x + Sublayer(Norm(x)). Gradient: ∂L/∂xₗ = ∂L/∂x_L·(I + $\\Sigma$∂F_k/∂x). The I (identity) ensures gradient magnitude ≥ |∂L/∂x_L| — prevents vanishing regardless of depth. Timeline: Post-Norm (2017 Transformer, BERT) → Pre-Norm+LayerNorm (GPT-2/3, 2019-20) → Pre-Norm+RMSNorm (LLaMA+, 2023+).",
      "seen_in": [
        "code",
        "architecture-papers"
      ],
      "related": [
        "rmsnorm",
        "layernorm",
        "residual-connection"
      ],
      "sources": [
        "Xiong et al., 'On Layer Normalization in the Transformer Architecture,' ICML 2020, arXiv:2002.04745"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "On Layer Normalization in the Transformer Architecture",
          "authors": "Xiong et al.",
          "venue": "ICML 2020",
          "arxiv": "2002.04745"
        }
      ],
      "resources": [
        {
          "label": "Sebastian Raschka — LLM Architecture Gallery",
          "url": "https://sebastianraschka.com/llm-architecture-gallery/"
        }
      ]
    },
    "pre-training": {
      "id": "pre-training",
      "name": "Pre-training",
      "expansion": "Pre-training (autoregressive language modeling)",
      "category": "training-pipeline",
      "oneliner": "Training a model from scratch on massive text via next-token prediction — the most expensive stage, producing a base model with no instruction-following ability.",
      "explanation": "Pre-training teaches a model to predict the next token on massive text corpora, typically hundreds of billions to trillions of tokens, with no human labels needed. The result is a base model that can complete text but does not follow instructions. Scaling laws suggest roughly 20 tokens per parameter, though modern practice over-trains smaller models for cheaper inference. Pre-training is the most expensive phase, costing millions of GPU-hours for large models.",
      "fundamentals": "Loss: $\\mathcal{L} = -\\frac{1}{T}\\sum_{t=1}^{T} \\log p_\\theta(x_t | x_1 \\ldots x_{t-1})$. Standard cross-entropy over vocabulary at each position. Data: raw text, tokenized (BPE/SPM), packed into fixed-length sequences (2K-128K tokens). Hyperparams: lr 1e-4 to 3e-4 (cosine decay), batch 4M-16M tokens/step, AdamW ($\\beta_1$=0.9, $\\beta_2$=0.95, wd=0.1), bf16 mixed precision. Compute: $\\approx 6ND$ FLOPs (N=params, D=tokens). 70B on 2T tokens $\\approx 8.4 \\times 10^{23}$ FLOPs.",
      "seen_in": [
        "model-cards",
        "documentation"
      ],
      "related": [
        "bf16",
        "cpt",
        "inference",
        "sft",
        "token",
        "tokenizer-bpe"
      ],
      "foundational_papers": [
        {
          "title": "Attention Is All You Need",
          "authors": "Vaswani et al.",
          "venue": "NeurIPS 2017",
          "arxiv": "1706.03762"
        },
        {
          "title": "Language Models are Few-Shot Learners (GPT-3)",
          "authors": "Brown et al.",
          "venue": "NeurIPS 2020",
          "arxiv": "2005.14165"
        },
        {
          "title": "Training Compute-Optimal Large Language Models (Chinchilla)",
          "authors": "Hoffmann et al.",
          "venue": "NeurIPS 2022",
          "arxiv": "2203.15556"
        },
        {
          "title": "LLaMA: Open and Efficient Foundation Language Models",
          "authors": "Touvron et al.",
          "venue": "2023",
          "arxiv": "2302.13971"
        }
      ],
      "sources": [
        "Vaswani et al., arXiv:1706.03762",
        "Brown et al., arXiv:2005.14165",
        "Hoffmann et al., arXiv:2203.15556"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10"
    },
    "prefix-caching": {
      "id": "prefix-caching",
      "name": "Prefix Caching",
      "expansion": "Automatic Prefix Caching (APC)",
      "category": "scaling-patterns",
      "oneliner": "Reuse precomputed KV cache blocks for shared prompt prefixes (system prompts, few-shot examples) across requests, avoiding redundant prefill computation.",
      "explanation": "Prefix caching stores the KV cache blocks produced during prompt prefill and reuses them when a subsequent request shares the same token prefix. Because the KV computation for any token depends only on preceding tokens, identical prefixes yield identical cache entries. This is especially effective for system prompts, few-shot templates, and multi-turn conversations where a large shared context is prepended to each query.",
      "fundamentals": "Each KV block is identified by a hash of the tokens it covers plus all preceding tokens. A global hash table maps hashes to physical GPU memory blocks. On a new request the scheduler walks the token sequence block-by-block: cache hit $\\rightarrow$ skip prefill for that block; cache miss $\\rightarrow$ compute and store. Eviction follows LRU. Best practice: place static content (system prompt, instructions) at the beginning to maximise prefix overlap. SGLang RadixAttention uses a trie structure enabling shared caching for tree-shaped conversation histories.",
      "related": [
        "kv-cache",
        "paged-attention",
        "continuous-batching"
      ],
      "seen_in": [
        "serving-config",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention",
          "authors": "Kwon et al.",
          "venue": "SOSP 2023",
          "arxiv": "2309.06180"
        },
        {
          "title": "SGLang: Efficient Execution of Structured Language Model Programs",
          "authors": "Zheng et al.",
          "venue": "arXiv 2024",
          "arxiv": "2312.07104"
        }
      ],
      "resources": [
        {
          "label": "vLLM APC docs",
          "url": "https://docs.vllm.ai/en/stable/design/prefix_caching/"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "prompt-engineering": {
      "id": "prompt-engineering",
      "name": "Prompt Engineering",
      "expansion": "Prompt Engineering",
      "category": "prompting",
      "oneliner": "The practice of designing input prompts to elicit desired behavior from LLMs — selecting wording, structure, examples, and constraints without changing model weights.",
      "explanation": "Prompt engineering is the practice of crafting inputs that guide a language model toward desired outputs without modifying its weights. It encompasses choosing the right phrasing, structuring instructions clearly, providing examples, and setting constraints. Techniques range from simple rewording to sophisticated strategies like chain-of-thought prompting, few-shot examples, and role-playing personas.",
      "related": [
        "few-shot",
        "zero-shot",
        "chain-of-thought",
        "system-prompt",
        "context-engineering"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "Lilian Weng — Prompt Engineering",
          "url": "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/"
        },
        {
          "label": "Anthropic prompt engineering guide",
          "url": "https://docs.anthropic.com/en/docs/build-with-claude/prompt-engineering"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "prompt-injection": {
      "id": "prompt-injection",
      "name": "Prompt Injection",
      "expansion": "Prompt Injection Attack",
      "category": "safety-alignment",
      "oneliner": "An attack where crafted input manipulates an LLM into ignoring its system instructions or executing unintended actions — the top LLM security vulnerability.",
      "explanation": "Prompt injection is an attack technique where adversarial text in the user input causes the model to override its system prompt instructions. Direct injection explicitly tells the model to ignore previous instructions. Indirect injection hides malicious instructions in retrieved documents or tool outputs that the model processes. It is ranked as the number one vulnerability in the OWASP Top 10 for LLM Applications.",
      "related": [
        "red-teaming",
        "guardrails",
        "tag-guard",
        "agentic-ai"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "OWASP LLM Top 10 — Prompt Injection",
          "url": "https://genai.owasp.org/llmrisk/llm01-prompt-injection/"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "pruning": {
      "id": "pruning",
      "name": "Pruning",
      "expansion": "Model Pruning (Weight / Neuron Removal)",
      "category": "scaling-patterns",
      "oneliner": "Removing unnecessary weights, neurons, or entire layers from a trained model to make it smaller and faster without retraining from scratch.",
      "explanation": "Pruning reduces a model's size by removing parameters that contribute least to its output. Unstructured pruning zeros out individual weights, creating sparse matrices. Structured pruning removes entire neurons, heads, or layers, producing a smaller dense model that runs faster on standard hardware. LLaMA 3.2 1B and 3B were created by structured pruning of the 8B model followed by distillation. Pruning is complementary to quantization.",
      "fundamentals": "Magnitude pruning: remove weights with smallest absolute value. Movement pruning: remove weights that move toward zero during fine-tuning. Structured pruning: remove entire rows/columns from weight matrices based on importance scores. Width pruning (neurons/heads) vs depth pruning (layers). Wanda (Weights AND Activations): considers both weight magnitude and activation magnitude for better pruning decisions without retraining.",
      "related": [
        "knowledge-distillation",
        "quantization",
        "model-size-memory"
      ],
      "seen_in": [
        "documentation",
        "model-cards"
      ],
      "foundational_papers": [
        {
          "title": "Wanda: A Simple and Effective Pruning Approach for Large Language Models",
          "authors": "Sun et al.",
          "venue": "2024",
          "arxiv": "2306.11695"
        }
      ],
      "resources": [
        {
          "label": "Lilian Weng — LLM Inference Optimization",
          "url": "https://lilianweng.github.io/posts/2023-01-10-inference-optimization/"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "ptq": {
      "id": "ptq",
      "name": "PTQ",
      "expansion": "Post-Training Quantization",
      "category": "quantization-basics",
      "oneliner": "Quantize an already-trained model without further training — fast (minutes to hours), practical, dominates LLM quantization.",
      "explanation": "Post-training quantization is a technique that converts a fully trained model to lower numerical precision in a single pass without any additional training. It requires no training infrastructure, just a trained model and optionally a small calibration dataset. Popular PTQ methods include GPTQ, AWQ, RTN, and EXL2. At 8-bit precision the quality loss is essentially zero, at 4-bit it is measurable but acceptable for most uses.",
      "fundamentals": "Workflow: 1) Load fp16/bf16 model. 2) Optional calibration pass. 3) Compute scale/zero-point per layer/channel/group. 4) Quantize: W_q = round(W/scale) + zp. 5) Save. At inference: dequantize and compute.",
      "seen_in": [
        "documentation",
        "model-cards"
      ],
      "related": [
        "awq",
        "bf16",
        "calibration-data",
        "exl2",
        "fp16",
        "gptq",
        "inference",
        "qat",
        "quantization"
      ],
      "sources": [
        "Nagel et al., arXiv:2106.08295"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "White Paper on Neural Network Quantization",
          "url": "https://arxiv.org/abs/2106.08295"
        }
      ]
    },
    "pydantic-ai": {
      "id": "pydantic-ai",
      "name": "Pydantic AI / Instructor",
      "expansion": "Type-Safe Structured Output from LLMs",
      "category": "agents-and-tools",
      "oneliner": "Libraries that force LLM outputs into validated Python types using Pydantic models, eliminating manual JSON parsing and handling retries automatically.",
      "explanation": "Pydantic AI and Instructor are libraries that extract structured, type-safe data from language model outputs. You define a Pydantic model describing the expected output schema, and the library handles prompt construction, JSON mode activation, parsing, and automatic retry on validation failure. This replaces fragile regex-based output parsing with reliable, schema-validated responses. Instructor works with OpenAI, Anthropic, and open-source models.",
      "fundamentals": "Core flow: define Pydantic model → library injects schema into system prompt or tool definition → model generates JSON → library validates against schema → retry with error feedback if invalid. Uses function calling / tool use APIs where available, falls back to JSON mode. Instructor: lightweight, focused on extraction. Pydantic AI: full agent framework with typed dependencies, streaming, and multi-step workflows. Both leverage Pydantic v2 for fast validation.",
      "related": [
        "function-calling",
        "agentic-ai",
        "guardrails"
      ],
      "seen_in": [
        "code",
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "Instructor library",
          "url": "https://github.com/jxnl/instructor"
        },
        {
          "label": "Pydantic AI docs",
          "url": "https://ai.pydantic.dev"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "qat": {
      "id": "qat",
      "name": "QAT",
      "expansion": "Quantization-Aware Training",
      "category": "quantization-basics",
      "oneliner": "Simulate quantization during training so the model learns to be robust to quantization noise — better quality than PTQ but requires full training infrastructure.",
      "explanation": "Quantization-Aware Training simulates low-precision math during the training forward pass so the model learns weights that survive quantization gracefully. Gradients still flow in full precision using a straight-through estimator to approximate rounding. The result is noticeably better quality than post-training quantization at 4-bit and below, but it requires full training infrastructure to run.",
      "fundamentals": "Forward: W_fake_q = dequantize(quantize(W)), Y = X @ W_fake_q. Backward: STE approximates d(round(x))/dx $\\approx$ 1 — gradients pass through as if rounding wasn't there. Optimizer updates full-precision master weights. The model learns weights that happen to quantize well.",
      "seen_in": [
        "model-cards",
        "documentation"
      ],
      "related": [
        "ptq",
        "quantization",
        "spinquant"
      ],
      "sources": [
        "Jacob et al., CVPR 2018, arXiv:1712.05877",
        "Liu et al., 'LLM-QAT,' arXiv:2305.17888"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference",
          "authors": "Jacob et al.",
          "venue": "CVPR 2018",
          "arxiv": "1712.05877"
        }
      ]
    },
    "qlora": {
      "id": "qlora",
      "name": "QLoRA",
      "expansion": "Quantized Low-Rank Adaptation",
      "category": "fine-tuning-methods",
      "oneliner": "Base model in 4-bit NF4 + LoRA in fp16. Fine-tune 65B on a single 48GB GPU. Three innovations: NF4, double quantization, paged optimizers.",
      "explanation": "QLoRA is a memory-efficient fine-tuning method that combines 4-bit quantization of the frozen base model with small LoRA adapter layers trained in higher precision. The base weights are stored in NF4 format, which is optimal for normally distributed values, while adapter weights run in bfloat16 to compensate for quantization error. A 65B model shrinks from around 130 GB to about 41 GB, making fine-tuning feasible on consumer GPUs with quality matching full-precision training.",
      "fundamentals": "Forward: h = Dequant_NF4($W_0$_q)·x + ($\\alpha$/r)·B·A·x. Memory 65B: ~33.5GB base + ~1.6GB adapters/optimizer + ~4-8GB activations $\\approx$ 40-44GB. NF4: 16 levels at N(0,1) quantile positions. Double quant: block-64 absmax → quantize to 8-bit in blocks of 256.",
      "seen_in": [
        "model-name",
        "model-config"
      ],
      "related": [
        "adapters",
        "bitsandbytes",
        "lora",
        "nf4",
        "quantization"
      ],
      "foundational_papers": [
        {
          "title": "QLoRA: Efficient Finetuning of Quantized Language Models",
          "authors": "Dettmers et al.",
          "venue": "NeurIPS 2023",
          "arxiv": "2305.14314"
        },
        {
          "title": "The case for 4-bit precision: k-bit Inference Scaling Laws",
          "authors": "Dettmers & Zettlemoyer",
          "venue": "ICML 2023",
          "arxiv": "2212.09720"
        }
      ],
      "sources": [
        "Dettmers et al., arXiv:2305.14314"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Sebastian Raschka — Practical Tips for LoRA",
          "url": "https://magazine.sebastianraschka.com/p/practical-tips-for-finetuning-llms"
        },
        {
          "label": "HuggingFace PEFT quantization guide",
          "url": "https://huggingface.co/docs/peft/en/developer_guides/quantization"
        }
      ]
    },
    "quantization": {
      "id": "quantization",
      "name": "Quantization",
      "expansion": "Model Quantization / Neural Network Quantization",
      "category": "quantization-basics",
      "oneliner": "Converting a model's high-precision floats to lower-precision representations — the single most impactful optimization for LLM serving.",
      "explanation": "Quantization converts a model's weights from 16-bit or 32-bit floats to smaller formats like 8-bit or 4-bit integers. This matters because LLM inference is memory-bandwidth-bound: every generated token requires reading all weights from memory, so smaller weights mean faster generation. A 70B-parameter model shrinks from 140 GB to roughly 35 GB at 4-bit. Quality at 8-bit is nearly lossless; at 4-bit there is small but measurable degradation.",
      "fundamentals": "Uniform affine quantization: $x_q = \\text{clamp}(\\text{round}(x/s + z),\\, q_{\\min},\\, q_{\\max})$. Dequantize: $\\hat{x} = s \\cdot (x_q - z)$. Scale $s = (x_{\\max} - x_{\\min})/(q_{\\max} - q_{\\min})$. Symmetric: $z=0$, $s = \\max(|x|)/q_{\\max}$. Error per value bounded by $s/2$. Not all weights contribute equally — attention layers and first/last layers are typically more sensitive.",
      "seen_in": [
        "documentation",
        "model-cards"
      ],
      "related": [
        "calibration-data",
        "dequantization",
        "inference",
        "ptq",
        "qat",
        "token"
      ],
      "sources": [
        "Gholami et al., 'A Survey of Quantization Methods,' arXiv:2103.13630"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "A Survey of Quantization Methods for Efficient Neural Network Inference",
          "authors": "Gholami et al.",
          "venue": "2021",
          "arxiv": "2103.13630"
        }
      ],
      "resources": [
        {
          "label": "HuggingFace — Quantization overview",
          "url": "https://huggingface.co/docs/transformers/en/main_classes/quantization"
        },
        {
          "label": "Lilian Weng — LLM Inference Optimization",
          "url": "https://lilianweng.github.io/posts/2023-01-10-inference-optimization/"
        }
      ]
    },
    "quanto": {
      "id": "quanto",
      "name": "Quanto (optimum-quanto)",
      "expansion": "HuggingFace's quantization library",
      "category": "quantization-methods",
      "oneliner": "HuggingFace's own quantization library — supports int2/int4/int8/float8 with a clean Pythonic API. Device-agnostic.",
      "explanation": "Quanto is a quantization toolkit from HuggingFace designed to be simple, composable, and hardware-agnostic. It supports both weight-only and weight-plus-activation quantization and runs on CPU, CUDA, and Apple MPS without code changes. It integrates directly with HuggingFace Transformers and Diffusers for easy experimentation. It trades peak throughput for ease of use compared to more specialized tools like GPTQ and AWQ.",
      "fundamentals": "Provides quantized tensor types that wrap standard PyTorch tensors with scale/zero-point metadata. Quantization is eager (happens at model load time). Supports symmetric and affine quantization, per-tensor and per-axis granularity.",
      "seen_in": [
        "quantization-config",
        "documentation"
      ],
      "related": [
        "awq",
        "bitsandbytes",
        "gptq",
        "quantization"
      ],
      "sources": [
        "github.com/huggingface/optimum-quanto"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10"
    },
    "quip-sharp": {
      "id": "quip-sharp",
      "name": "QuIP#",
      "expansion": "Quantization with Incoherence Processing (sharp)",
      "category": "quantization-methods",
      "oneliner": "2-bit quantization using random rotations to spread information evenly across weights, then vector quantization with E8 lattice codebooks.",
      "explanation": "QuIP# is a weight quantization method that applies random orthogonal rotations to model weights before compressing them, making the values more uniform and easier to quantize accurately. The rotations use fast Hadamard transforms to ensure no single weight is disproportionately important, a property called incoherence. After rotation, lattice-based vector quantization encodes the weights at very low bitwidths. It achieves viable 2-bit compression for 70B-plus models.",
      "fundamentals": "1) Rotate weights: W' = UWV^T (random orthogonal U, V from Hadamard matrices). 2) Vector quantize rotated weights using E8 lattice codebook. 3) At inference: decode codebook entries and apply inverse rotations. The rotation is the key — it makes all weights roughly equally important, so uniform quantization works better.",
      "seen_in": [
        "model-name",
        "research-repos"
      ],
      "related": [
        "aqlm",
        "inference",
        "quantization"
      ],
      "sources": [
        "Chee et al., 'QuIP#,' arXiv:2402.04396"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "QuIP#: Even Better LLM Quantization with Hadamard Incoherence and Lattice Codebooks",
          "authors": "Chee et al.",
          "venue": "ICML 2024",
          "arxiv": "2402.04396"
        }
      ]
    },
    "rag": {
      "id": "rag",
      "name": "RAG",
      "expansion": "Retrieval-Augmented Generation",
      "category": "scaling-patterns",
      "oneliner": "Augment an LLM with retrieved documents at inference time so it can answer questions grounded in external knowledge without retraining.",
      "explanation": "Retrieval-Augmented Generation is an inference pattern that pairs a language model with an external retrieval system. When a query arrives, a retriever (typically a dense embedding model) fetches relevant documents from a knowledge base, and those documents are concatenated into the prompt before generation. This grounds the model's output in up-to-date, verifiable facts, reducing hallucination and eliminating the need to bake all knowledge into model weights.",
      "fundamentals": "Pipeline: query $\\rightarrow$ embed(query) $\\rightarrow$ vector search top-$k$ $\\rightarrow$ concat(docs, query) $\\rightarrow$ LLM generate. Retriever: bi-encoder (e.g. DPR) maps query and documents to shared embedding space; cosine similarity selects top-$k$. Generator: any causal LM conditioned on retrieved context. Original RAG formulation: $p(y|x) = \\sum_{z \\in \\text{top-}k} p(z|x)\\,p(y|x,z)$, marginalising over retrieved documents $z$. Chunking strategy, chunk size, overlap, and reranking significantly affect quality.",
      "related": [
        "token",
        "inference",
        "tag-embed"
      ],
      "seen_in": [
        "code",
        "serving-config",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
          "authors": "Lewis et al.",
          "venue": "NeurIPS 2020",
          "arxiv": "2005.11401"
        }
      ],
      "resources": [
        {
          "label": "Original paper",
          "url": "https://arxiv.org/abs/2005.11401"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "react": {
      "id": "react",
      "name": "ReAct",
      "expansion": "ReAct (Reasoning + Acting)",
      "category": "agents-and-tools",
      "oneliner": "A prompting framework that interleaves reasoning traces with action steps, enabling LLMs to think about a problem and use tools in alternating steps.",
      "explanation": "ReAct is a prompting strategy that combines chain-of-thought reasoning with tool use in an interleaved loop. At each step, the model generates a thought (reasoning about what to do next), then an action (calling a tool like search or calculator), then observes the result, and repeats until it has enough information to answer. This think-act-observe loop is the foundation of most LLM agent architectures.",
      "related": [
        "agentic-ai",
        "chain-of-thought",
        "function-calling",
        "tool-langchain"
      ],
      "seen_in": [
        "code",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
          "authors": "Yao et al.",
          "venue": "ICLR 2023",
          "arxiv": "2210.03629"
        }
      ],
      "resources": [
        {
          "label": "ReAct paper",
          "url": "https://arxiv.org/abs/2210.03629"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "reasoning-models": {
      "id": "reasoning-models",
      "name": "Reasoning Models",
      "expansion": "Reasoning Models (o1/R1 paradigm)",
      "category": "scaling-patterns",
      "oneliner": "LLMs trained via RL to produce extended chain-of-thought before answering — trading inference compute for dramatically better accuracy on hard problems.",
      "explanation": "Reasoning models are LLMs trained via reinforcement learning to produce extended chains of thought before answering. They decompose problems into steps, verify intermediate results, and backtrack when stuck. This paradigm emerged in late 2024 with OpenAI o1, followed by DeepSeek R1 and Qwen QwQ. On AIME 2024, reasoning models jumped from 15 to over 70 percent accuracy, rivaling competitive math students.",
      "fundamentals": "Core loop: given prompt $x$, generate reasoning trace $z_1, z_2, \\ldots, z_T$ then answer $y$. Trained via RL (PPO or GRPO) with reward $r$ based on answer correctness: $\\max_{\\theta} \\mathbb{E}_{x}\\left[r(x, y) - \\beta\\,\\mathrm{KL}(\\pi_\\theta \\| \\pi_{\\mathrm{ref}})\\right]$. No SFT on reasoning traces needed — R1-Zero showed reasoning emerges purely from RL on a pretrained base. Inference cost scales with problem difficulty: easy questions use hundreds of reasoning tokens, hard ones use tens of thousands.",
      "related": [
        "chain-of-thought",
        "test-time-compute",
        "grpo",
        "rlvr",
        "thinking-tokens",
        "rlhf",
        "knowledge-distillation"
      ],
      "seen_in": [
        "model-cards",
        "API-docs",
        "benchmarks"
      ],
      "foundational_papers": [
        {
          "title": "Learning to reason with LLMs (OpenAI o1 system card)",
          "authors": "OpenAI",
          "venue": "2024",
          "arxiv": ""
        },
        {
          "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
          "authors": "DeepSeek-AI",
          "venue": "2025",
          "arxiv": "2501.12948"
        }
      ],
      "resources": [
        {
          "label": "OpenAI reasoning models guide",
          "url": "https://platform.openai.com/docs/guides/reasoning"
        },
        {
          "label": "HuggingFace LLM Course: DeepSeek R1",
          "url": "https://huggingface.co/learn/llm-course/en/chapter12/3"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "red-teaming": {
      "id": "red-teaming",
      "name": "Red Teaming",
      "expansion": "Red Teaming (Adversarial Testing of LLMs)",
      "category": "scaling-patterns",
      "oneliner": "Systematically probing a language model for failures, biases, and safety vulnerabilities through adversarial prompting before deployment.",
      "explanation": "Red teaming is the practice of deliberately trying to make a language model produce harmful, incorrect, or undesirable outputs. Human red teamers craft adversarial prompts that probe for jailbreaks, bias, hallucinations, and policy violations. Automated red teaming uses one model to generate attacks against another at scale. Red teaming is a standard step in the post-training safety pipeline at major labs — both Anthropic and OpenAI publish red teaming reports alongside model releases.",
      "fundamentals": "Manual red teaming: domain experts systematically probe categories (violence, deception, bias, privacy, CBRN). Automated red teaming: an attacker model generates adversarial prompts, optimized via gradient-based search (GCG), genetic algorithms, or RL. Jailbreak categories: role-playing, encoding tricks, multi-turn escalation, indirect injection. Defense: safety training (RLHF with red team data), input/output filters, constitutional AI.",
      "related": [
        "evals",
        "rlhf",
        "tag-guard",
        "guardrails"
      ],
      "seen_in": [
        "documentation",
        "model-cards"
      ],
      "foundational_papers": [
        {
          "title": "Red Teaming Language Models to Reduce Harms",
          "authors": "Ganguli et al.",
          "venue": "2022",
          "arxiv": "2209.07858"
        }
      ],
      "resources": [
        {
          "label": "Anthropic red teaming paper",
          "url": "https://arxiv.org/abs/2209.07858"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "repetition-penalty": {
      "id": "repetition-penalty",
      "name": "Repetition Penalty",
      "expansion": "Repetition Penalty / Frequency Penalty",
      "category": "sampling-decoding",
      "oneliner": "An inference-time parameter that reduces the probability of tokens already generated, preventing the model from repeating phrases or getting stuck in loops.",
      "explanation": "Repetition penalty is applied during decoding to discourage the model from generating the same tokens or phrases repeatedly. It works by dividing the logit of any previously generated token by a penalty factor (typically 1.1-1.3), making it less likely to be selected again. Frequency penalty variants scale the penalty by how many times a token has appeared. Without repetition penalty, models often degenerate into repetitive loops, especially with greedy or low-temperature decoding.",
      "related": [
        "temperature",
        "greedy-decoding",
        "top-p"
      ],
      "seen_in": [
        "model-config",
        "code"
      ],
      "foundational_papers": [],
      "resources": [],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "residual-connection": {
      "id": "residual-connection",
      "name": "Residual Connection",
      "expansion": "Residual (Skip) Connection",
      "category": "layer-types",
      "oneliner": "The output = x + F(x) pattern — adds sub-layer input directly to output, enabling gradient flow through arbitrarily deep networks. Zero additional parameters.",
      "explanation": "A residual connection adds a layer's input directly to its output, so each layer only needs to learn a small correction rather than a full transformation. This creates a clean gradient highway through the network, preventing the vanishing gradients that make deep networks impossible to train. In transformers, every attention and feed-forward block is wrapped in a residual connection, forming a shared communication channel that all layers read from and write to.",
      "fundamentals": "x_L = x_0 + $\\Sigma$ F_l(x_{l-1}). Gradient: ∂L/∂x_l = ∂L/∂x_L·(I + $\\Sigma$∂F_k/∂x) — the I term prevents vanishing. Without residuals: ∂L/∂x_0 = Π∂F_l/∂x_{l-1} — product of L Jacobians, exponentially vanishes/explodes. Pre-Norm: output = x + Sublayer(Norm(x)) — raw x flows through skip. Some models scale residuals at init: GPT-2 scales by 1/$\\sqrt{2n_layers}$. DeepNorm: x·$\\alpha$ + Sublayer(Norm(x)) with $\\alpha$>1 for very deep Post-Norm training.",
      "seen_in": [
        "code",
        "architecture-diagrams"
      ],
      "related": [
        "layernorm",
        "pre-norm",
        "rmsnorm"
      ],
      "sources": [
        "He et al., 'Deep Residual Learning for Image Recognition,' CVPR 2016, arXiv:1512.03385",
        "Elhage et al., 'A Mathematical Framework for Transformer Circuits,' Anthropic 2021"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Deep Residual Learning for Image Recognition",
          "authors": "He et al.",
          "venue": "CVPR 2016",
          "arxiv": "1512.03385"
        }
      ],
      "resources": [
        {
          "label": "Jay Alammar — The Illustrated Transformer",
          "url": "https://jalammar.github.io/illustrated-transformer/"
        }
      ]
    },
    "reward-model": {
      "id": "reward-model",
      "name": "Reward Model",
      "expansion": "Reward Model (RM)",
      "category": "safety-alignment",
      "oneliner": "A model trained on human preference data to score the quality of LLM responses, providing the optimization signal for RLHF alignment training.",
      "explanation": "A reward model is a neural network trained to predict which of two responses a human would prefer, given a prompt. It takes a prompt-response pair and outputs a scalar score indicating quality. The reward model is the bridge between human preferences and the mathematical optimization in RLHF — it translates subjective judgments into a differentiable training signal. It is typically initialized from the SFT model with the language modeling head replaced by a scalar output.",
      "related": [
        "rlhf",
        "dpo",
        "tag-reward",
        "rlaif"
      ],
      "seen_in": [
        "documentation",
        "code"
      ],
      "foundational_papers": [
        {
          "title": "Training language models to follow instructions with human feedback",
          "authors": "Ouyang et al.",
          "venue": "NeurIPS 2022",
          "arxiv": "2203.02155"
        }
      ],
      "resources": [
        {
          "label": "HuggingFace TRL — Reward modeling",
          "url": "https://huggingface.co/docs/trl/en/reward_trainer"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "rlaif": {
      "id": "rlaif",
      "name": "RLAIF",
      "expansion": "Reinforcement Learning from AI Feedback",
      "category": "safety-alignment",
      "oneliner": "Using an LLM to generate preference labels instead of human annotators, making RLHF-style alignment scalable without expensive human data collection.",
      "explanation": "RLAIF replaces human annotators in the RLHF pipeline with an AI model that generates preference judgments. A strong model like GPT-4 or Claude compares two candidate responses and decides which is better, producing the same kind of preference pairs that humans would generate. This dramatically reduces the cost and speed of collecting alignment data. Constitutional AI from Anthropic is a specific form of RLAIF where the judge model uses explicit written principles.",
      "related": [
        "rlhf",
        "dpo",
        "constitutional-ai",
        "evals"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "RLAIF: Scaling Reinforcement Learning from Human Feedback with AI Feedback",
          "authors": "Lee et al.",
          "venue": "2023",
          "arxiv": "2309.00267"
        }
      ],
      "resources": [
        {
          "label": "Cameron Wolfe — RLAIF overview",
          "url": "https://cameronrwolfe.substack.com/p/rlaif-reinforcement-learning-from"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "rlhf": {
      "id": "rlhf",
      "name": "RLHF",
      "expansion": "Reinforcement Learning from Human Feedback",
      "category": "training-pipeline",
      "oneliner": "Train reward model from preferences, optimize LLM via PPO + KL penalty. Complex (4 models in memory) but the original alignment breakthrough.",
      "explanation": "RLHF is a training procedure that aligns language models with human preferences using reinforcement learning. It works in three stages: supervised fine-tuning on demonstrations, training a reward model on human preference comparisons, and optimizing the policy against that reward using PPO with a KL penalty. Four models run simultaneously during the final stage. It powers ChatGPT, Claude, and Llama 2-Chat, though its complexity motivated simpler alternatives like DPO.",
      "fundamentals": "RM loss: $\\mathcal{L} = -\\mathbb{E}[\\log \\sigma(r(x,y_w) - r(x,y_l))]$. PPO: $\\max \\mathbb{E}[r(x,y) - \\beta \\cdot \\text{KL}(\\pi_\\theta \\| \\pi_{\\text{ref}})]$. PPO clips: $\\mathcal{L} = -\\min(\\text{ratio} \\cdot \\hat{A},\\, \\text{clip}(\\text{ratio}, 1 \\pm \\varepsilon) \\cdot \\hat{A})$. $\\beta$ 0.01-0.2, $\\varepsilon$=0.2. Preference data: 50K-500K comparisons.",
      "seen_in": [
        "model-cards",
        "documentation"
      ],
      "related": [
        "dpo",
        "sft"
      ],
      "foundational_papers": [
        {
          "title": "Training language models to follow instructions with human feedback (InstructGPT)",
          "authors": "Ouyang et al.",
          "venue": "NeurIPS 2022",
          "arxiv": "2203.02155"
        },
        {
          "title": "Deep reinforcement learning from human preferences",
          "authors": "Christiano et al.",
          "venue": "NeurIPS 2017",
          "arxiv": "1706.03741"
        },
        {
          "title": "Proximal Policy Optimization Algorithms",
          "authors": "Schulman et al.",
          "venue": "2017",
          "arxiv": "1707.06347"
        },
        {
          "title": "Learning to summarize from human feedback",
          "authors": "Stiennon et al.",
          "venue": "NeurIPS 2020",
          "arxiv": "2009.01325"
        },
        {
          "title": "Constitutional AI: Harmlessness from AI Feedback",
          "authors": "Bai et al.",
          "venue": "2022",
          "arxiv": "2212.08073"
        },
        {
          "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models",
          "authors": "Touvron et al.",
          "venue": "2023",
          "arxiv": "2307.09288"
        }
      ],
      "sources": [
        "Ouyang et al., arXiv:2203.02155"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Lilian Weng — Reward Hacking in RL",
          "url": "https://lilianweng.github.io/posts/2024-11-28-reward-hacking/"
        },
        {
          "label": "HuggingFace Deep RL Course — RLHF",
          "url": "https://huggingface.co/learn/deep-rl-course/en/unitbonus3/rlhf"
        }
      ]
    },
    "rlvr": {
      "id": "rlvr",
      "name": "RLVR",
      "expansion": "Reinforcement Learning with Verifiable Rewards",
      "category": "training-pipeline",
      "oneliner": "RL training where rewards come from automated verification (math checker, code tests, format rules) rather than human labels or learned reward models.",
      "explanation": "Reinforcement Learning with Verifiable Rewards is a training paradigm where reward signals come from automatic, objectively checkable criteria rather than human labels or learned reward models. For math the verifier checks the answer against ground truth; for code it runs test cases. This eliminates reward hacking and annotation cost. RLVR powered DeepSeek R1-Zero, where only accuracy and format rewards were used, yet self-verification and backtracking emerged spontaneously.",
      "fundamentals": "Reward function: $r(x, y) = r_{\\text{accuracy}}(x, y) + r_{\\text{format}}(x, y)$. Accuracy reward is binary (1 if answer matches ground truth, 0 otherwise) or graded. Format reward enforces structural constraints (e.g., answer within \\texttt{\\textbackslash boxed\\{\\}}). No learned reward model — verification is deterministic and unhackable. Training data: question-answer pairs where answers are machine-checkable (math, code, logic puzzles). Scales to millions of problems via synthetic generation. Limitation: only works for domains with verifiable answers; open-ended generation still needs RLHF/DPO.",
      "related": [
        "rlhf",
        "grpo",
        "reasoning-models",
        "dpo"
      ],
      "seen_in": [
        "research-papers",
        "training-recipes"
      ],
      "foundational_papers": [
        {
          "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
          "authors": "DeepSeek-AI",
          "venue": "2025",
          "arxiv": "2501.12948"
        },
        {
          "title": "Reinforcement Learning with Verifiable Rewards Implicitly Incentivizes Correct Reasoning in Base LLMs",
          "authors": "Various",
          "venue": "2025",
          "arxiv": "2506.14245"
        }
      ],
      "resources": [
        {
          "label": "RLVR explained (Promptfoo)",
          "url": "https://www.promptfoo.dev/blog/rlvr-explained/"
        },
        {
          "label": "Reasoning Gym for RLVR (NeurIPS 2025)",
          "url": "https://neurips.cc/virtual/2025/poster/121745"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "rmsnorm": {
      "id": "rmsnorm",
      "name": "RMSNorm",
      "expansion": "Root Mean Square Layer Normalization",
      "category": "layer-types",
      "oneliner": "Simplified LayerNorm — normalizes by root mean square only, dropping mean subtraction. Same quality, fewer ops. Standard in all modern LLMs.",
      "explanation": "RMSNorm is a simplified version of Layer Normalization that drops the mean-centering step and normalizes activations by their root mean square alone. This saves a full reduction operation, making it faster than LayerNorm while producing equivalent training quality. It has only a learnable scale parameter with no bias. Virtually every major open-weight LLM since 2023 uses RMSNorm, including Llama, Mistral, Qwen, and Gemma.",
      "fundamentals": "$\\text{RMSNorm}(x) = \\gamma \\cdot \\frac{x}{\\text{RMS}(x)}$ where $\\text{RMS}(x) = \\sqrt{\\frac{1}{d}\\sum_i x_i^2 + \\varepsilon}$. vs LayerNorm: no $\\mu$ computation, no $(x-\\mu)$ subtraction, no $\\beta$ bias. Params: $d_{\\text{model}}$ (vs $2 \\times d_{\\text{model}}$ for LayerNorm). For LLaMA 7B: 65 RMSNorm instances $\\times$ 4096 = 266K params (<0.004% of model). Computational savings: 1 reduction op vs 2, ~2x fewer reductions (the GPU synchronization bottleneck).",
      "seen_in": [
        "model-config",
        "code"
      ],
      "related": [
        "layernorm",
        "pre-norm"
      ],
      "sources": [
        "Zhang & Sennrich, 'Root Mean Square Layer Normalization,' NeurIPS 2019, arXiv:1910.07467"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Root Mean Square Layer Normalization",
          "authors": "Zhang & Sennrich",
          "venue": "NeurIPS 2019",
          "arxiv": "1910.07467"
        }
      ],
      "resources": [
        {
          "label": "Sebastian Raschka — LLM Architecture Gallery",
          "url": "https://sebastianraschka.com/llm-architecture-gallery/"
        }
      ]
    },
    "rope": {
      "id": "rope",
      "name": "RoPE",
      "expansion": "Rotary Position Embedding",
      "category": "position-encodings",
      "oneliner": "Encodes position by rotating Q and K vectors in pairs of dimensions — the dot product naturally depends only on relative position (m-n), not absolute.",
      "explanation": "Rotary Position Embedding encodes token positions by rotating query and key vectors in paired dimensions before the attention dot product. The resulting attention score depends only on the relative distance between tokens, not their absolute positions. Each dimension pair rotates at a different frequency, creating a multi-scale position signal with no learned parameters. RoPE is used by Llama, Mistral, Qwen, Gemma, and most other major open-weight LLMs.",
      "fundamentals": "Rotation per dimension pair: $[q'_{2i},\\, q'_{2i+1}] = R(m \\cdot \\theta_i) \\cdot [q_{2i},\\, q_{2i+1}]$ where $R$ is a $2 \\times 2$ rotation matrix. $\\theta_i = \\text{base}^{-2i/d}$. Complex form: $z'_i = z_i \\cdot e^{j \\cdot m \\cdot \\theta_i}$. Wavelengths: $2\\pi$ (i=0, high freq, local) to $2\\pi \\cdot \\text{base}$ (last pair, low freq, global). For base=10,000: max wavelength $\\approx$ 62,832 positions. Key property: $q^T(m) \\cdot k(n) = q^T(0) \\cdot R(n-m) \\cdot k(0)$ — depends only on relative offset. No learned params. Decaying inter-token signal acts as soft locality bias.",
      "seen_in": [
        "model-config"
      ],
      "related": [
        "abf",
        "alibi",
        "ntk-rope",
        "position-interpolation",
        "sinusoidal-pe",
        "token",
        "yarn"
      ],
      "sources": [
        "Su et al., 'RoFormer: Enhanced Transformer with Rotary Position Embedding,' 2021, arXiv:2104.09864"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "RoFormer: Enhanced Transformer with Rotary Position Embedding",
          "authors": "Su et al.",
          "venue": "2021",
          "arxiv": "2104.09864"
        }
      ],
      "resources": [
        {
          "label": "Lilian Weng — The Transformer Family v2",
          "url": "https://lilianweng.github.io/posts/2023-01-27-the-transformer-family-v2/"
        }
      ]
    },
    "rouge": {
      "id": "rouge",
      "name": "ROUGE",
      "expansion": "ROUGE — Recall-Oriented Understudy for Gisting Evaluation",
      "category": "scaling-patterns",
      "oneliner": "A family of metrics measuring word-level overlap between generated text and reference text — the standard evaluation for summarization tasks.",
      "explanation": "ROUGE is a set of metrics that compare a generated summary to one or more reference summaries by measuring n-gram overlap. ROUGE-1 counts matching individual words, ROUGE-2 counts matching two-word phrases, and ROUGE-L measures the longest common subsequence. Scores range from 0 to 1, with higher indicating more overlap with the reference. ROUGE is the standard metric for summarization evaluation and appears in benchmarks and research papers.",
      "related": [
        "evals",
        "perplexity"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "rwkv": {
      "id": "rwkv",
      "name": "RWKV",
      "expansion": "RWKV (Receptance Weighted Key Value)",
      "category": "layer-types",
      "oneliner": "An RNN-based architecture that combines transformer-style parallelizable training with constant-memory recurrent inference — an alternative to both transformers and Mamba.",
      "explanation": "RWKV is an architecture that achieves transformer-quality language modeling while using a linear-complexity recurrence instead of quadratic attention. During training it can be parallelized like a transformer, but during inference it processes tokens one at a time with constant memory, like an RNN. This makes it particularly attractive for long-context and resource-constrained deployment. RWKV models up to 14B parameters are available on HuggingFace.",
      "related": [
        "mamba",
        "self-attention",
        "mha"
      ],
      "seen_in": [
        "model-config",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "RWKV: Reinventing RNNs for the Transformer Era",
          "authors": "Peng et al.",
          "venue": "EMNLP 2023",
          "arxiv": "2305.13048"
        }
      ],
      "resources": [
        {
          "label": "RWKV GitHub",
          "url": "https://github.com/BlinkDL/RWKV-LM"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "safetensors": {
      "id": "safetensors",
      "name": "safetensors",
      "expansion": "Safe Tensors serialization format",
      "category": "formats",
      "oneliner": "HuggingFace's secure, fast tensor serialization — replaces pickle-based .bin files. Immune to arbitrary code execution.",
      "explanation": "safetensors is a secure file format for storing model weights that prevents code execution attacks. The standard PyTorch serialization uses Python pickle, which can run arbitrary code when a file is loaded, creating a serious security risk. safetensors instead stores tensors as raw bytes with a simple JSON header, making it impossible to embed malicious code. It also supports memory mapping for faster loading and is now the default on HuggingFace Hub.",
      "fundamentals": "File structure: 8-byte header length, JSON header (tensor names, shapes, dtypes, offsets), raw tensor data. No nested structures, no code, no pickle. Can be memory-mapped for zero-copy loading. Validation: can verify file integrity without loading all data.",
      "seen_in": [
        "filename"
      ],
      "related": [
        "gguf"
      ],
      "sources": [
        "github.com/huggingface/safetensors"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10"
    },
    "scaling-laws": {
      "id": "scaling-laws",
      "name": "Scaling Laws",
      "expansion": "Neural Scaling Laws (Kaplan / Chinchilla)",
      "category": "training-pipeline",
      "oneliner": "Empirical relationships governing how model performance improves with more parameters, data, and compute — the foundation of LLM training budgets.",
      "explanation": "Scaling laws are empirical formulas predicting how model loss decreases as you increase parameter count, training data, or compute. The Chinchilla study (2022) found that for a given compute budget, you should train on roughly 20 tokens per parameter — meaning many models were too large and undertrained. This shifted the field toward smaller models trained on more data, exemplified by Llama training 7B on over one trillion tokens rather than using those FLOPs on a larger model.",
      "fundamentals": "Kaplan scaling: $L(N) \\propto N^{-0.076}$ where $N$ = parameters, $L$ = loss. Chinchilla optimal: for compute budget $C \\approx 6ND$ FLOPs, the optimal allocation is $N \\propto C^{0.5}$ and $D \\propto C^{0.5}$, meaning params and tokens should scale equally. Practical rule: train on ~20 tokens per parameter. Over-training (more tokens than optimal) is common because inference cost scales with $N$ but not $D$.",
      "related": [
        "pre-training",
        "dense-models"
      ],
      "seen_in": [
        "documentation",
        "model-cards"
      ],
      "foundational_papers": [
        {
          "title": "Scaling Laws for Neural Language Models",
          "authors": "Kaplan et al.",
          "venue": "2020",
          "arxiv": "2001.08361"
        },
        {
          "title": "Training Compute-Optimal Large Language Models (Chinchilla)",
          "authors": "Hoffmann et al.",
          "venue": "NeurIPS 2022",
          "arxiv": "2203.15556"
        }
      ],
      "resources": [
        {
          "label": "Sebastian Raschka — State of LLMs 2025",
          "url": "https://magazine.sebastianraschka.com/p/state-of-llms-2025"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "scaling-test-time": {
      "id": "scaling-test-time",
      "name": "Test-Time Compute Scaling",
      "expansion": "Test-Time Compute Scaling (Inference-Time Scaling)",
      "category": "scaling-patterns",
      "oneliner": "Spending more computation at inference time on harder problems — generating longer reasoning chains or sampling multiple answers — rather than just using a bigger model.",
      "explanation": "Test-time compute scaling is the principle that you can improve model performance by spending more compute during inference rather than during training. Instead of always generating one quick answer, the model generates extended reasoning chains, samples multiple candidate solutions, or uses a verifier to select the best one. This shifts the scaling paradigm from bigger models to smarter inference.",
      "fundamentals": "Two axes: serial scaling (longer chains of thought within one generation, controlled by token budget) and parallel scaling (generating N candidates and selecting the best via majority vote or a reward model). Snell et al. showed that a smaller model with optimal test-time compute allocation can outperform a 14x larger model on hard problems. Compute-optimal strategy: easy problems get short answers, hard problems get extended reasoning. The difficulty estimator decides how much compute to allocate per problem.",
      "related": [
        "reasoning-models",
        "chain-of-thought",
        "thinking-tokens",
        "speculative-decoding"
      ],
      "seen_in": [
        "documentation",
        "model-cards"
      ],
      "foundational_papers": [
        {
          "title": "Scaling LLM Test-Time Compute Optimally Can be More Effective than Scaling Model Parameters",
          "authors": "Snell et al.",
          "venue": "ICLR 2025",
          "arxiv": "2408.03314"
        }
      ],
      "resources": [
        {
          "label": "Snell et al. paper",
          "url": "https://arxiv.org/abs/2408.03314"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "sdpa": {
      "id": "sdpa",
      "name": "SDPA",
      "expansion": "Scaled Dot-Product Attention (PyTorch API)",
      "category": "attention-variants",
      "oneliner": "PyTorch's fused attention primitive — auto-dispatches to Flash Attention, Memory-Efficient, or math fallback based on hardware and input constraints.",
      "explanation": "Scaled Dot-Product Attention in PyTorch 2.0 and later fuses the entire attention computation into a single optimized kernel call. Before this API, users had to implement attention manually or rely on external libraries. It automatically selects the best backend: Flash Attention when possible for maximum speed, memory-efficient attention for arbitrary masks, or a math fallback for CPU and unsupported data types. It is now the default attention implementation in HuggingFace Transformers.",
      "fundamentals": "API: F.scaled_dot_product_attention(query, key, value, attn_mask=None, is_causal=False, scale=None). Shapes: Q [batch, n_heads, seq_q, head_dim], K/V [batch, n_heads, seq_kv, head_dim]. Override backend: torch.backends.cuda.sdp_kernel(enable_flash=True, enable_mem_efficient=False, enable_math=False). Performance on A100 seq_len=2048: Math=baseline, Memory-Efficient=2-3$\\times$ faster, Flash=3-5$\\times$ faster.",
      "seen_in": [
        "code",
        "model-config"
      ],
      "related": [
        "flash-attention",
        "mha"
      ],
      "sources": [
        "PyTorch 2.0 docs: torch.nn.functional.scaled_dot_product_attention",
        "Vaswani et al., arXiv:1706.03762 (original formula)"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "PyTorch SDPA documentation",
          "url": "https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html"
        }
      ]
    },
    "self-attention": {
      "id": "self-attention",
      "name": "Self-Attention",
      "expansion": "Self-Attention (Scaled Dot-Product Attention)",
      "category": "attention-variants",
      "oneliner": "The mechanism where every position in a sequence computes a weighted sum over all other positions to capture contextual relationships.",
      "explanation": "Self-attention lets each token in a sequence look at every other token to decide what context matters most. Each token is projected into a query, key, and value vector. Attention scores are computed as scaled dot products between queries and keys, then softmaxed into weights over the value vectors. This is the core mechanism inside transformers, giving them the ability to capture long-range dependencies in a single layer.",
      "fundamentals": "Given input $X \\in \\mathbb{R}^{n \\times d}$, self-attention computes $Q = XW^Q$, $K = XW^K$, $V = XW^V$ and returns $\\text{Attention}(Q,K,V) = \\text{softmax}\\left(\\frac{QK^\\top}{\\sqrt{d_k}}\\right)V$. Cost is $O(n^2 d)$ in time and $O(n^2)$ in memory.",
      "related": [
        "gqa",
        "mha",
        "mqa",
        "sdpa",
        "token"
      ],
      "seen_in": [
        "code",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Attention Is All You Need",
          "authors": "Vaswani et al.",
          "venue": "NeurIPS 2017",
          "arxiv": "1706.03762"
        }
      ],
      "resources": [
        {
          "label": "The Illustrated Transformer",
          "url": "https://jalammar.github.io/illustrated-transformer/"
        },
        {
          "label": "3Blue1Brown — Attention in Transformers",
          "url": "https://www.3blue1brown.com/lessons/attention"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "seq2seq": {
      "id": "seq2seq",
      "name": "Sequence-to-Sequence",
      "expansion": "Sequence-to-Sequence (Encoder-Decoder)",
      "category": "layer-types",
      "oneliner": "An encoder-decoder architecture that reads an input sequence in full, then generates an output sequence token by token.",
      "explanation": "A sequence-to-sequence model uses an encoder to read an entire input sequence with bidirectional attention, then a decoder to generate the output one token at a time using the encoder's representations. This design fits tasks where input and output are different sequences, such as translation and summarization. T5 and BART are the best-known seq2seq transformers, though decoder-only models now dominate chat and general generation.",
      "fundamentals": "The encoder produces $H_{\\text{enc}} = \\text{Encoder}(x_1, \\ldots, x_n)$ using bidirectional attention. The decoder at step $t$ computes causal self-attention over $y_{<t}$, then cross-attention: $\\text{softmax}\\left(\\frac{Q_{\\text{dec}} K_{\\text{enc}}^\\top}{\\sqrt{d_k}}\\right) V_{\\text{enc}}$. Encoder KV cache computed once during prefill.",
      "related": [
        "causal-lm",
        "kv-cache",
        "mha",
        "self-attention",
        "token"
      ],
      "seen_in": [
        "model-config",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer",
          "authors": "Raffel et al.",
          "venue": "JMLR 2020",
          "arxiv": "1910.10683"
        }
      ],
      "resources": [
        {
          "label": "HuggingFace T5 docs",
          "url": "https://huggingface.co/docs/transformers/model_doc/t5"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "sft": {
      "id": "sft",
      "name": "SFT",
      "expansion": "Supervised Fine-Tuning (Instruction Tuning)",
      "category": "training-pipeline",
      "oneliner": "Training on (instruction, response) pairs — loss computed only on response tokens, not the prompt. Quality >> quantity (LIMA: 1K examples can suffice).",
      "explanation": "SFT is a training stage that transforms a base language model into an instruction-following assistant by training it on prompt-response examples. The model learns to generate helpful responses by computing loss only on the response tokens while masking out the prompt. Typical datasets range from 10K to 1M examples drawn from collections like FLAN, Alpaca, ShareGPT, and UltraChat. SFT sits between pre-training and alignment at a small fraction of the pre-training cost.",
      "fundamentals": "Loss: $\\mathcal{L} = -\\frac{1}{|y|}\\sum_t \\log p_\\theta(y_t | x, y_1 \\ldots y_{t-1})$. Prompt $x$ tokens masked. Data: {messages: [{role:user, content:...}, {role:assistant, content:...}]}. Hyperparams: lr 1e-5 to 2e-5, 1-5 epochs. Often uses LoRA/QLoRA.",
      "seen_in": [
        "model-name",
        "model-cards"
      ],
      "related": [
        "dpo",
        "lora",
        "pre-training",
        "qlora",
        "rlhf"
      ],
      "foundational_papers": [
        {
          "title": "Training language models to follow instructions with human feedback (InstructGPT)",
          "authors": "Ouyang et al.",
          "venue": "NeurIPS 2022",
          "arxiv": "2203.02155"
        },
        {
          "title": "Scaling Instruction-Finetuned Language Models (FLAN v2)",
          "authors": "Chung et al.",
          "venue": "2022",
          "arxiv": "2210.11416"
        },
        {
          "title": "LIMA: Less Is More for Alignment",
          "authors": "Zhou et al.",
          "venue": "NeurIPS 2023",
          "arxiv": "2305.11206"
        },
        {
          "title": "Self-Instruct: Aligning Language Models with Self-Generated Instructions",
          "authors": "Wang et al.",
          "venue": "ACL 2023",
          "arxiv": "2212.10560"
        }
      ],
      "sources": [
        "Ouyang et al., arXiv:2203.02155"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Cameron Wolfe — Understanding SFT",
          "url": "https://cameronrwolfe.substack.com/p/understanding-and-using-supervised"
        },
        {
          "label": "HuggingFace TRL docs",
          "url": "https://huggingface.co/docs/trl/en/index"
        }
      ]
    },
    "simpo": {
      "id": "simpo",
      "name": "SimPO",
      "expansion": "Simple Preference Optimization",
      "category": "training-pipeline",
      "oneliner": "Reference-free DPO: length-normalized log-prob as implicit reward + target margin $\\gamma$. Halves memory vs DPO.",
      "explanation": "Simple Preference Optimization is a reference-free variant of DPO that removes the need to keep a frozen reference model in memory, cutting GPU memory roughly in half during alignment training. It uses the average log-probability per token as an implicit reward, with a margin term requiring chosen responses to score above rejected ones by a minimum gap. SimPO matches or beats DPO on standard benchmarks while being simpler to implement.",
      "fundamentals": "L = -E[log $\\sigma$($\\beta$·avg_logp(y_w|x) - $\\beta$·avg_logp(y_l|x) - $\\gamma$)]. avg_logp = (1/|y|)$\\Sigma$ log $\\pi_\\theta(y_t|x,y_<t)$. No $\\pi_{\\text{ref}}$. $\\beta$ 2.0-2.5, $\\gamma$ 0.5-1.5.",
      "seen_in": [
        "model-name",
        "model-cards"
      ],
      "related": [
        "dpo",
        "orpo",
        "token"
      ],
      "foundational_papers": [
        {
          "title": "SimPO: Simple Preference Optimization with a Reference-Free Reward",
          "authors": "Meng et al.",
          "venue": "NeurIPS 2024",
          "arxiv": "2405.14734"
        }
      ],
      "sources": [
        "Meng et al., arXiv:2405.14734"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10"
    },
    "sinusoidal-pe": {
      "id": "sinusoidal-pe",
      "name": "Sinusoidal Position Encoding",
      "expansion": "Sinusoidal Position Encoding",
      "category": "position-encodings",
      "oneliner": "The original Transformer position encoding: sin/cos at geometrically spaced frequencies, added to token embeddings. Mostly replaced by RoPE.",
      "explanation": "Sinusoidal positional encoding, from the original Transformer paper, gives the model a sense of token order by adding a fixed vector of sine and cosine values at geometrically spaced frequencies to each token embedding. It has no learned parameters and was designed so that relative positions could be represented as linear transformations. Modern LLMs have largely replaced it with RoPE, which better preserves positional information across layers.",
      "fundamentals": "PE(pos, 2i) = sin(pos/$10000^{2i/d}$). PE(pos, 2i+1) = cos(pos/$10000^{2i/d}$). Frequencies: $\\omega_k = 1/10000^{2k/d}$, geometric from 1 to 1/10000. Added to token embedding: $x'(pos) = x(pos) + PE(pos)$. Properties: $\\|PE(pos)\\|^2 = d/2$ (bounded, constant). $PE(pos+k) = M_k \\cdot PE(pos)$ where $M_k$ is a block-diagonal rotation matrix — relative position as linear transform. 0 learned params.",
      "seen_in": [
        "legacy-models",
        "documentation"
      ],
      "related": [
        "alibi",
        "learned-pe",
        "rope",
        "token"
      ],
      "sources": [
        "Vaswani et al., 'Attention Is All You Need,' NeurIPS 2017, arXiv:1706.03762"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Attention Is All You Need",
          "authors": "Vaswani et al.",
          "venue": "NeurIPS 2017",
          "arxiv": "1706.03762"
        }
      ]
    },
    "size-A": {
      "id": "size-A",
      "name": "A (Active params)",
      "expansion": "Active parameters per token in MoE",
      "category": "model-naming",
      "oneliner": "Active parameters per forward pass in MoE. Qwen2.5-A14B = 57B total, 14B active. Gemma 26B-A4B = 25.2B total, 3.8B active.",
      "explanation": "A marks the active parameter count in a Mixture-of-Experts model, which is the number of parameters that actually compute for each token. For example, Qwen2.5-MoE-A2.7B-Instruct has 14.3B total parameters but only 2.7 billion fire per token because the router selects a small subset of experts. This distinction is critical for estimating inference speed and GPU cost: all parameters must fit in memory, but latency and FLOPs scale with the active count.",
      "seen_in": [
        "model-name"
      ],
      "related": [
        "inference",
        "moe",
        "size-B",
        "size-E",
        "token"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Qwen2.5-MoE model card (A14B notation)",
          "url": "https://huggingface.co/Qwen/Qwen2.5-72B-A14B"
        }
      ]
    },
    "size-B": {
      "id": "size-B",
      "name": "B (Billion)",
      "expansion": "Billion parameters",
      "category": "model-naming",
      "oneliner": "The universal size marker. 7B = 7 billion parameters. Refers to total params unless combined with A (active).",
      "explanation": "When you see a number followed by B in a model name, it means billion parameters. Llama-3.1-8B has eight billion, Qwen2.5-72B has seventy-two billion. The convention is nearly universal, though some older repos use lowercase (7b instead of 7B). If the name shows a single number with no active/total distinction, assume the model is dense and every parameter fires on every token.",
      "seen_in": [
        "model-name"
      ],
      "related": [
        "dense-models",
        "size-A",
        "size-E",
        "token"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Llama 3.1 model card (size convention example)",
          "url": "https://huggingface.co/meta-llama/Llama-3.1-8B"
        }
      ]
    },
    "size-E": {
      "id": "size-E",
      "name": "E (Effective params)",
      "expansion": "Effective parameters (Gemma-specific)",
      "category": "model-naming",
      "oneliner": "Google Gemma notation: effective capacity accounting for architectural efficiency (PLE, parameter sharing). E4B $\\approx$ performs like 4B dense.",
      "explanation": "E stands for effective parameters, a convention specific to Google's Gemma 3n and Gemma 4 series. The Gemma-3n-E4B model has roughly eight billion raw parameters, but because it uses Per-Layer Embedding sharing, many of those weights are reused, giving it the compute profile of about 4.5 billion parameters. Google introduced the E tag so users can compare real-world performance expectations without being misled by the inflated raw count.",
      "seen_in": [
        "model-name"
      ],
      "related": [
        "ple",
        "size-A",
        "size-B"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Gemma 3 Technical Report (E2B/E4B notation)",
          "url": "https://arxiv.org/abs/2503.19786"
        }
      ]
    },
    "size-T": {
      "id": "size-T",
      "name": "T (Trillion)",
      "expansion": "Trillion parameters or tokens",
      "category": "model-naming",
      "oneliner": "Trillions. Rare for params (Switch Transformer 1.6T). More common for training data ('trained on 15T tokens'). Context matters.",
      "explanation": "T means trillion parameters when it appears as a size marker, but context matters because the same letter shows up in training-data descriptions where 1T means one trillion tokens of training text, not model size. Very few public-weight models exceed one trillion parameters. If you see 1T in a HuggingFace repo title, double-check whether it refers to the model or the pre-training corpus volume.",
      "seen_in": [
        "model-cards",
        "documentation"
      ],
      "related": [
        "pre-training",
        "size-B"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Switch Transformer paper (1.6T params)",
          "url": "https://arxiv.org/abs/2101.03961"
        }
      ]
    },
    "size-context": {
      "id": "size-context",
      "name": "K / M (Context length)",
      "expansion": "Thousands / Millions of tokens context",
      "category": "model-naming",
      "oneliner": "128K = 128,000 token context. 1M = 1 million. Distinguishes long-context variants from defaults.",
      "explanation": "A number like 1M, 128K, or 4K in the model name indicates the maximum context window in tokens. Qwen2.5-7B-Instruct-1M supports up to one million tokens, while Llama-3.1 variants advertise 128K in their model cards even though it rarely appears in the repo name. Not every model puts this in the title, so you may need to check config.json for max_position_embeddings.",
      "seen_in": [
        "model-name",
        "model-cards"
      ],
      "related": [
        "rope",
        "yarn",
        "abf"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Llama 3.1 paper (128K context)",
          "url": "https://arxiv.org/abs/2407.21783"
        }
      ]
    },
    "size-x": {
      "id": "size-x",
      "name": "x (MoE expert notation)",
      "expansion": "Experts × expert size (e.g., 8x7B)",
      "category": "model-naming",
      "oneliner": "Mixtral-style: 8x7B = 8 experts of ~7B each. Total ≠ 8$\\times$7B due to shared attention layers. Active = top-k experts worth.",
      "explanation": "The NxSB pattern popularized by Mixtral describes Mixture-of-Experts model sizes, where N is the number of experts and S is the size of each expert. For example, 8x7B means 8 experts of roughly 7 billion parameters each, but the total parameter count is about 46.7 billion rather than 56 billion because the attention layers are shared across all experts. The active parameter count per token is much smaller, around 12.9 billion for Mixtral since only the top 2 experts are selected for each token.",
      "seen_in": [
        "model-name"
      ],
      "related": [
        "expert-routing",
        "moe",
        "token"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Mixtral of Experts paper (8x7B notation)",
          "url": "https://arxiv.org/abs/2401.04088"
        }
      ]
    },
    "smoothquant": {
      "id": "smoothquant",
      "name": "SmoothQuant",
      "expansion": "SmoothQuant — Activation-Aware Weight Smoothing for W8A8",
      "category": "quantization-methods",
      "oneliner": "A technique that migrates the quantization difficulty from hard-to-quantize activations to easy-to-quantize weights, enabling practical W8A8 inference.",
      "explanation": "SmoothQuant makes W8A8 quantization practical by mathematically smoothing activation outliers before quantization. LLM activations have extreme values in certain channels that make direct INT8 quantization fail. SmoothQuant applies per-channel scaling that divides activations by a smoothing factor and multiplies corresponding weights by the same factor. This is mathematically equivalent but redistributes the difficulty from activations to weights, enabling both to be quantized to INT8.",
      "fundamentals": "For each channel $j$: smooth factor $s_j = \\max(|X_j|)^\\alpha / \\max(|W_j|)^{(1-\\alpha)}$ where $\\alpha \\in [0,1]$ controls the migration strength (typically 0.5). Smoothed: $\\hat{X}_j = X_j / s_j$, $\\hat{W}_j = W_j \\cdot s_j$. Output unchanged: $Y = XW = (X/s)(sW) = \\hat{X}\\hat{W}$. The smoothing factors are computed once from calibration data and folded into preceding LayerNorm parameters.",
      "related": [
        "w8a8",
        "int8",
        "quantization",
        "calibration-data",
        "awq"
      ],
      "seen_in": [
        "documentation",
        "serving-config"
      ],
      "foundational_papers": [
        {
          "title": "SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models",
          "authors": "Xiao et al.",
          "venue": "ICML 2023",
          "arxiv": "2211.10438"
        }
      ],
      "resources": [
        {
          "label": "Lilian Weng — LLM Inference Optimization",
          "url": "https://lilianweng.github.io/posts/2023-01-10-inference-optimization/"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "speculative-decoding": {
      "id": "speculative-decoding",
      "name": "Speculative Decoding",
      "expansion": "Speculative Decoding / Speculative Sampling",
      "category": "scaling-patterns",
      "oneliner": "Small draft model proposes K candidate tokens, large model verifies all K in one forward pass. 2-3$\\times$ speedup with provably identical output distribution.",
      "explanation": "Speculative decoding speeds up LLM generation by having a small draft model rapidly propose several candidate tokens, then letting the large target model verify them all in a single forward pass. This works because processing one token or several in parallel costs nearly the same on modern hardware. Accepted tokens provably match the target model's output distribution, so quality is identical. Typical speedups are two to three times.",
      "fundamentals": "Verification per position i: accept draft token x with prob min(1, p(x)/q(x)) where p=target, q=draft. If rejected: sample from p'(x) = max(0, p(x)-q(x)) / Z. Expected tokens per step: (1-$\\alpha$^(K+1))/(1-$\\alpha$) where $\\alpha$=acceptance rate. For K=5, $\\alpha$=0.8: ~3.36 tokens/step → ~2.7$\\times$ speedup. Draft model can be: smaller same-family model, separate small model, or even n-gram model. Self-speculative: use target model with early exit for drafting.",
      "seen_in": [
        "serving-config",
        "documentation"
      ],
      "related": [
        "continuous-batching",
        "kv-cache",
        "token"
      ],
      "sources": [
        "Leviathan et al., 'Fast Inference from Transformers via Speculative Decoding,' ICML 2023, arXiv:2211.17192",
        "Chen et al., 'Accelerating LLM Decoding with Speculative Sampling,' arXiv:2302.01318"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Fast Inference from Transformers via Speculative Decoding",
          "authors": "Leviathan et al.",
          "venue": "ICML 2023",
          "arxiv": "2211.17192"
        },
        {
          "title": "Accelerating Large Language Model Decoding with Speculative Sampling",
          "authors": "Chen et al.",
          "venue": "2023",
          "arxiv": "2302.01318"
        }
      ],
      "resources": [
        {
          "label": "Lilian Weng — LLM Inference Optimization",
          "url": "https://lilianweng.github.io/posts/2023-01-10-inference-optimization/"
        }
      ]
    },
    "spinquant": {
      "id": "spinquant",
      "name": "SpinQuant",
      "expansion": "Rotation-based Quantization-Aware Training",
      "category": "quantization-methods",
      "oneliner": "Learns optimal rotation matrices during training so that rotated weights are easier to quantize — a QAT approach using geometric transformations.",
      "explanation": "SpinQuant is a quantization method that learns optimal rotation matrices to transform model weights into a form that is easier to compress with minimal quality loss. It shares the insight that rotated weights are more uniform and thus more quantization-friendly, but instead of using random rotations it optimizes them during a fine-tuning phase. This combines the geometric benefits of incoherence processing with training-time adaptation. Published by Meta researchers.",
      "fundamentals": "During fine-tuning: insert learnable rotation matrices R before quantization in each layer. Forward pass: W' = R @ W, then quantize W'. Gradients flow through to both W and R via STE. The rotations R converge to transformations that spread weight information evenly, minimizing quantization error.",
      "seen_in": [
        "research-papers",
        "model-cards"
      ],
      "related": [
        "qat",
        "quantization",
        "quip-sharp"
      ],
      "sources": [
        "Liu et al., 'SpinQuant,' arXiv:2405.16406"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "SpinQuant: LLM Quantization with Learned Rotations",
          "authors": "Liu et al.",
          "venue": "2024",
          "arxiv": "2405.16406"
        }
      ]
    },
    "structured-output": {
      "id": "structured-output",
      "name": "Structured Output",
      "expansion": "Structured Output / Constrained Decoding",
      "category": "sampling-decoding",
      "oneliner": "Forcing an LLM to generate output that conforms to a specific format like JSON, XML, or a grammar by masking invalid tokens during generation.",
      "explanation": "Structured output techniques force a language model to produce responses that conform to a defined schema or grammar. The most common approach, constrained decoding, works by masking out tokens at each generation step that would violate the target format. For JSON output, only tokens that continue a valid JSON structure are allowed. OpenAI, Anthropic, and open-source tools like Outlines and SGLang support this natively.",
      "related": [
        "function-calling",
        "pydantic-ai",
        "guardrails"
      ],
      "seen_in": [
        "code",
        "documentation",
        "serving-config"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "Outlines library",
          "url": "https://github.com/dottxt-ai/outlines"
        },
        {
          "label": "OpenAI structured outputs",
          "url": "https://platform.openai.com/docs/guides/structured-outputs"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "swa": {
      "id": "swa",
      "name": "SWA",
      "expansion": "Sliding Window Attention",
      "category": "attention-variants",
      "oneliner": "Each token attends only to a fixed-size local window of W preceding tokens — reduces quadratic cost to linear, with global reach through layer stacking.",
      "explanation": "Sliding Window Attention restricts each token to attending only to a fixed window of nearby tokens instead of the entire sequence. This reduces attention cost from quadratic to linear and bounds KV cache memory regardless of sequence length. Information still propagates globally through stacked layers, since each layer expands the effective receptive field. Models like Gemma 2 alternate sliding window layers with full-attention layers for a hybrid approach.",
      "fundamentals": "Token at position i attends to [max(0, i-W+1), ..., i] instead of [0, ..., i]. Attention mask: band matrix, not lower-triangular. Memory for attention scores: $O(n$\\times$W)$ vs $O($n^2$)$. Rolling KV cache: fixed buffer of W entries, oldest overwritten. For Mistral 7B (8 KV heads, head_dim=128, W=4096): cache per layer = 2 $\\times$ 8 $\\times$ 128 $\\times$ 4096 $\\times$ 2 = 16 MB; 32 layers = 512 MB constant regardless of generation length. Hybrid approach: even layers = SWA (W=4096), odd layers = full attention.",
      "seen_in": [
        "model-config"
      ],
      "related": [
        "attention-mask",
        "flash-attention",
        "gqa",
        "kv-cache",
        "mha",
        "token"
      ],
      "sources": [
        "Jiang et al., 'Mistral 7B,' 2023, arXiv:2310.06825",
        "Beltagy et al., 'Longformer,' 2020, arXiv:2004.05150"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Mistral 7B",
          "authors": "Jiang et al.",
          "venue": "2023",
          "arxiv": "2310.06825"
        },
        {
          "title": "Longformer: The Long-Document Transformer",
          "authors": "Beltagy et al.",
          "venue": "2020",
          "arxiv": "2004.05150"
        }
      ]
    },
    "swiglu": {
      "id": "swiglu",
      "name": "SwiGLU",
      "expansion": "Swish-activated Gated Linear Unit",
      "category": "layer-types",
      "oneliner": "Element-wise product of a Swish-activated gate projection and a value projection — the de facto standard FFN activation in modern LLMs.",
      "explanation": "SwiGLU is a gated activation function used in the feed-forward blocks of modern transformers, replacing ReLU. It splits the computation into two parallel linear projections: one passed through the Swish activation to act as a gate, and another providing the values. These are multiplied element-wise before a final down-projection. SwiGLU achieves lower perplexity than ReLU at the same compute cost and is standard in Llama, Mistral, and Qwen.",
      "fundamentals": "SwiGLU(x) = (Swish(xW_gate) ⊙ xW_up)W_down. Swish(z) = z·$\\sigma$(z). For LLaMA 7B: W_gate [4096, 11008], W_up [4096, 11008], W_down [11008, 4096]. Params: 3$\\times$4096$\\times$11008 $\\approx$ 135M $\\approx$ 8$d^2$. In HuggingFace config: hidden_act='silu' (SiLU is the activation within the GLU structure). Code pattern: act_fn(gate_proj(x)) * up_proj(x) then down_proj(...).",
      "seen_in": [
        "model-config",
        "code"
      ],
      "related": [
        "ffn",
        "geglu"
      ],
      "sources": [
        "Shazeer, 'GLU Variants Improve Transformer,' 2020, arXiv:2002.05202"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "GLU Variants Improve Transformer",
          "authors": "Shazeer",
          "venue": "2020",
          "arxiv": "2002.05202"
        }
      ],
      "resources": [
        {
          "label": "Sebastian Raschka — LLM Architecture Gallery",
          "url": "https://sebastianraschka.com/llm-architecture-gallery/"
        }
      ]
    },
    "synthetic-data": {
      "id": "synthetic-data",
      "name": "Synthetic Data",
      "expansion": "Synthetic Training Data",
      "category": "datasets-recipes",
      "oneliner": "Training data generated by AI models rather than collected from humans — used to scale instruction datasets, simulate scenarios, and augment scarce domain data.",
      "explanation": "Synthetic data is training data produced by language models rather than curated from human sources. An LLM generates instruction-response pairs, reasoning traces, or domain-specific examples that are then used to train or fine-tune other models. The Phi series demonstrated that models trained primarily on synthetic textbook-quality data can punch far above their size.",
      "related": [
        "dataset-evolinstruct",
        "dataset-orca",
        "knowledge-distillation",
        "sft"
      ],
      "seen_in": [
        "documentation",
        "model-cards"
      ],
      "foundational_papers": [
        {
          "title": "Self-Instruct: Aligning Language Models with Self-Generated Instructions",
          "authors": "Wang et al.",
          "venue": "ACL 2023",
          "arxiv": "2212.10560"
        }
      ],
      "resources": [],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "system-prompt": {
      "id": "system-prompt",
      "name": "System Prompt",
      "expansion": "System Prompt (System Message)",
      "category": "prompting",
      "oneliner": "A hidden instruction layer that defines the LLM's persona, behavior rules, and constraints before the user's message is processed.",
      "explanation": "The system prompt is a special message at the beginning of a conversation that sets the model's behavior, persona, and constraints. It is typically hidden from the user but visible to the model. System prompts define tone, output format, safety rules, and domain expertise. They are the primary customization mechanism for LLM applications — the same base model can behave as a coding assistant, medical advisor, or creative writer depending on its system prompt.",
      "related": [
        "prompt-engineering",
        "context-engineering",
        "prompt-injection",
        "prefix-caching"
      ],
      "seen_in": [
        "code",
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "OpenAI system message guide",
          "url": "https://platform.openai.com/docs/guides/text?api-mode=chat"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "tag-alignment": {
      "id": "tag-alignment",
      "name": "-sft / -dpo / -rlhf / -orpo / -kto",
      "expansion": "Specific alignment method used",
      "category": "model-naming",
      "oneliner": "Signals which training stage/method produced the model. -sft = supervised only. -dpo = DPO-aligned. Often released in pairs for comparison.",
      "explanation": "Alignment-method suffixes like -sft, -dpo, -rlhf, -kto, or -orpo tell you exactly which post-training recipe was applied. Researchers publish these to enable direct comparisons: for instance, zephyr-7b-sft-full used only supervised fine-tuning while zephyr-7b-beta added a round of DPO on top. If you are benchmarking alignment approaches, these tags let you isolate the effect of each training phase.",
      "seen_in": [
        "model-name",
        "model-cards"
      ],
      "related": [
        "sft",
        "dpo",
        "rlhf",
        "orpo",
        "kto"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "DPO paper (most common alignment tag)",
          "url": "https://arxiv.org/abs/2305.18290"
        }
      ]
    },
    "tag-base": {
      "id": "tag-base",
      "name": "-base / -raw / -pretrain / -pt",
      "expansion": "Base pretrained model",
      "category": "model-naming",
      "oneliner": "No instruction tuning. Next-token prediction only. The starting point for fine-tuning. Google uses -pt, others use -base or no suffix.",
      "explanation": "A base model (also labeled -raw, -pretrain, or -pt) is the pretrained checkpoint before any instruction tuning or alignment. It predicts the next token given a prompt but has no built-in chat behavior, so it will continue your text rather than answer questions. Meta's Llama 3.1 uses no suffix for the base variant and adds -Instruct for the tuned one, while Google Gemma marks the base with -pt. If you are fine-tuning, start from the base.",
      "seen_in": [
        "model-name"
      ],
      "related": [
        "sft",
        "tag-instruct",
        "token"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "HuggingFace model hub (base vs instruct naming)",
          "url": "https://huggingface.co/models"
        }
      ]
    },
    "tag-coder": {
      "id": "tag-coder",
      "name": "-Coder / -Code",
      "expansion": "Code-optimized model",
      "category": "model-naming",
      "oneliner": "CPT or fine-tuned on code corpora. Distinct models, not just prompting. Qwen2.5-Coder, DeepSeek-Coder, CodeLlama.",
      "explanation": "A -Coder or Code- tag means the model was specialized for programming tasks, usually through continued pre-training on hundreds of billions of code tokens plus code-specific instruction tuning. CodeLlama prefixes it (CodeLlama-7b), while Qwen suffixes it (Qwen2.5-Coder-7B). These models score much higher on HumanEval and MBPP benchmarks than their general-purpose siblings, but may underperform on pure-prose tasks.",
      "seen_in": [
        "model-name"
      ],
      "related": [
        "cpt",
        "pre-training"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Code Llama paper",
          "url": "https://arxiv.org/abs/2308.12950"
        }
      ]
    },
    "tag-embed": {
      "id": "tag-embed",
      "name": "-Embed / -Embedding",
      "expansion": "Embedding model",
      "category": "model-naming",
      "oneliner": "Produces fixed-size vectors for retrieval/similarity, not generative text. Often encoder-only or uses mean pooling over decoder.",
      "explanation": "An embedding model outputs a single fixed-length vector for each input rather than generating text. These vectors are used for retrieval-augmented generation, semantic search, clustering, and similarity scoring. Unlike generative models, embedding models are loaded differently and called without a generate loop. Check the model card for the recommended prompt prefix and output dimension, as these vary across families.",
      "seen_in": [
        "model-name"
      ],
      "related": [
        "tag-mrl"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "MTEB Leaderboard (embedding model benchmarks)",
          "url": "https://huggingface.co/spaces/mteb/leaderboard"
        }
      ]
    },
    "tag-ft": {
      "id": "tag-ft",
      "name": "-ft / -full-ft / -fft / -lora / -qlora / -adapter",
      "expansion": "Fine-tuning method tags",
      "category": "model-naming",
      "oneliner": "-ft: generic fine-tuned. -full-ft/-fft: all params updated. -lora/-qlora: adapter type. -adapter: generic (usually bottleneck). Check model card for details.",
      "explanation": "Fine-tuning suffixes indicate how a model was adapted. -ft is generic and could mean any method. -lora means Low-Rank Adaptation was applied, with adapter files typically 10-100 MB. -full-ft means every parameter was updated, producing a full model copy. -qlora means quantized LoRA training was used. Check for adapter_config.json to confirm a LoRA-based fine-tune versus full-weight files.",
      "seen_in": [
        "model-name",
        "filename"
      ],
      "related": [
        "lora",
        "qlora",
        "full-ft",
        "adapters"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "HuggingFace PEFT docs",
          "url": "https://huggingface.co/docs/peft"
        }
      ]
    },
    "tag-guard": {
      "id": "tag-guard",
      "name": "-Guard / -Shield",
      "expansion": "Safety classifier model",
      "category": "model-naming",
      "oneliner": "Detects harmful content, prompt injections, policy violations. Outputs safe/unsafe classification, not generative text. Llama-Guard-3.",
      "explanation": "Guard models like Llama-Guard-3-8B are classifiers trained to flag unsafe content in prompts or model responses. They run as a filter alongside your main generative model, receiving a prompt-response pair and returning a safety label. Deploy one before or after generation to enforce content policies without modifying the main model. Guard models are small enough to run alongside the generator with minimal latency overhead.",
      "seen_in": [
        "model-name"
      ],
      "related": [],
      "foundational_papers": [
        {
          "title": "Llama Guard: LLM-based Input-Output Safeguard for Human-AI Conversations",
          "authors": "Inan et al.",
          "venue": "2023",
          "arxiv": "2312.06674"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-10"
    },
    "tag-hf": {
      "id": "tag-hf",
      "name": "-hf",
      "expansion": "HuggingFace-compatible format",
      "category": "model-naming",
      "oneliner": "Weights converted to HuggingFace transformers format (safetensors + config.json). Does NOT mean 'hosted on HF'. Without -hf may need custom loading.",
      "explanation": "The -hf suffix means the checkpoint has been converted to HuggingFace Transformers format with standard PyTorch weight files and a config.json. Early Llama 2 releases shipped in Meta's internal format, so the community created -hf conversions. Starting with Llama 3 and most 2024-era releases, models ship HF-native, so the suffix is disappearing. If you see a repo without -hf, check whether it contains a config.json.",
      "seen_in": [
        "model-name"
      ],
      "related": [
        "safetensors"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "HuggingFace Transformers docs (model loading)",
          "url": "https://huggingface.co/docs/transformers/main/en/model_doc/auto"
        }
      ]
    },
    "tag-instruct": {
      "id": "tag-instruct",
      "name": "-it / -instruct / -chat",
      "expansion": "Instruction-tuned / chat-tuned",
      "category": "model-naming",
      "oneliner": "-it (Google Gemma), -Instruct (Meta Llama, Mistral), -Chat (Qwen 1/2). Same concept, different branding. Designed for instruction-following.",
      "explanation": "The -Instruct suffix (and its synonyms -Chat, -it) means the model has been fine-tuned with supervised instruction-following data and typically aligned with RLHF or DPO. Qwen shifted branding from -Chat to -Instruct starting with Qwen2.5, but the underlying training recipe is similar. There is no meaningful technical difference between -instruct and -chat across families; it is purely a naming preference.",
      "seen_in": [
        "model-name"
      ],
      "related": [
        "sft",
        "dpo",
        "rlhf"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Llama 3.1 Instruct model card",
          "url": "https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct"
        }
      ]
    },
    "tag-long": {
      "id": "tag-long",
      "name": "-Long",
      "expansion": "Extended context window variant",
      "category": "model-naming",
      "oneliner": "Trained or fine-tuned for longer sequences. Some use -Long suffix, others embed context length in name (128K, 1M).",
      "explanation": "A -Long or context-length tag means the model's context window has been extended beyond its base training length, typically using RoPE scaling techniques like YaRN, NTK-aware interpolation, or Adjusted Base Frequency. These models are usually continued-trained on long documents after the base RoPE modification. Performance may degrade at the extreme ends of the claimed range, so check the model card for validated effective lengths.",
      "seen_in": [
        "model-name"
      ],
      "related": [
        "yarn",
        "abf",
        "rope"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "YaRN paper (context extension)",
          "url": "https://arxiv.org/abs/2309.00071"
        }
      ]
    },
    "tag-math": {
      "id": "tag-math",
      "name": "-Math",
      "expansion": "Math-optimized model",
      "category": "model-naming",
      "oneliner": "Specialized for mathematical reasoning via training data and reward signals. Qwen2.5-Math, DeepSeek-Math.",
      "explanation": "The -Math suffix signals a model fine-tuned specifically for mathematical reasoning. Qwen2.5-Math-7B and DeepSeek-Math-7B are evaluated on GSM8K, MATH, and competition-level benchmarks. Some math models employ tool-augmented reasoning, invoking a code interpreter to handle arithmetic rather than relying on the LLM's native calculation ability. If your use case is general chat, a math-tagged model may be less helpful.",
      "seen_in": [
        "model-name"
      ],
      "related": [
        "cpt",
        "sft"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Qwen2.5-Math model card",
          "url": "https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct"
        }
      ]
    },
    "tag-merged": {
      "id": "tag-merged",
      "name": "-merged / -unmerged",
      "expansion": "Adapter merge state",
      "category": "model-naming",
      "oneliner": "-merged: LoRA folded into base weights (standalone, no overhead, irreversible). -unmerged: adapter separate (can hot-swap, needs PEFT loading).",
      "explanation": "A merged adapter model has the LoRA weight deltas baked into the base model weights, producing full-size checkpoint files that load like any standard model. An unmerged model ships the adapter files separately alongside a pointer to the base model, which saves storage but requires PEFT-aware loading code. If you see -merged in the name, expect larger files but simpler deployment.",
      "seen_in": [
        "model-name",
        "filename"
      ],
      "related": [
        "lora",
        "peft",
        "adapters"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "HuggingFace PEFT docs (merging adapters)",
          "url": "https://huggingface.co/docs/peft/main/en/developer_guides/lora#merge-lora-weights-into-the-base-model"
        }
      ]
    },
    "tag-moe-suffix": {
      "id": "tag-moe-suffix",
      "name": "-MOE / -MoE",
      "expansion": "Mixture-of-Experts architecture",
      "category": "model-naming",
      "oneliner": "Explicit MoE tag. Qwen1.5-MoE-A2.7B, DBRX (MoE). Mixtral uses 8x7B notation instead.",
      "explanation": "Some model families explicitly tag themselves with -MoE in the name (Qwen1.5-MoE-A2.7B), while others encode the mixture structure in the expert notation instead (Mixtral-8x7B). If neither appears in the name, check config.json for num_local_experts: a value greater than one confirms a MoE architecture. This matters because MoE models require more total memory than their active parameter count suggests.",
      "seen_in": [
        "model-name",
        "model-config"
      ],
      "related": [
        "moe"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Mixtral paper",
          "url": "https://arxiv.org/abs/2401.04088"
        }
      ]
    },
    "tag-mrl": {
      "id": "tag-mrl",
      "name": "-MRL",
      "expansion": "Matryoshka Representation Learning",
      "category": "model-naming",
      "oneliner": "Embedding model supporting variable-dimensionality outputs — truncate to smaller sizes with minimal quality loss. Named after Russian nesting dolls.",
      "explanation": "MRL stands for Matryoshka Representation Learning, a training technique that makes an embedding model useful at multiple output dimensions. You can truncate its full-dimension output to a smaller size and still get competitive retrieval quality. This lets you trade storage and computation cost against accuracy at inference time without retraining. When you see MRL in a model name or card, it means you can freely choose your embedding dimension.",
      "seen_in": [
        "model-name"
      ],
      "related": [
        "inference",
        "tag-embed"
      ],
      "foundational_papers": [
        {
          "title": "Matryoshka Representation Learning",
          "authors": "Kusupati et al.",
          "venue": "NeurIPS 2022",
          "arxiv": "2205.13147"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-10"
    },
    "tag-reward": {
      "id": "tag-reward",
      "name": "-Reward / -RM",
      "expansion": "Reward model",
      "category": "model-naming",
      "oneliner": "Outputs scalar preference scores for (prompt, response) pairs. Used in RLHF pipelines or best-of-N reranking. Not generative.",
      "explanation": "A reward model takes a prompt-response pair and outputs a scalar score indicating quality. Starling-RM-34B and tulu-v3.1-8b-rl-rm are examples. Reward models are a critical component of the RLHF pipeline and are also used standalone for best-of-N reranking at inference time. These are classifier-like models, not generators, so you call them differently from chat models.",
      "seen_in": [
        "model-name"
      ],
      "related": [
        "inference",
        "rlhf"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "InstructGPT paper (reward model training)",
          "url": "https://arxiv.org/abs/2203.02155"
        }
      ]
    },
    "tag-scale": {
      "id": "tag-scale",
      "name": "-Nano/-Micro/-Mini/-Small/-Tiny/-Lite/-Pro/-Turbo/-Large/-Max/-Ultra",
      "expansion": "Relative scale/quality tiers within a model family",
      "category": "model-naming",
      "oneliner": "NOT standardized across families. Mistral 'Small' = 24B. Phi 'Mini' = 3.8B. Always check actual param count. -Turbo = speed-optimized (OpenAI convention).",
      "explanation": "Scale-tier tags like -Lite, -Mini, -Small, -Pro, -Max, and -Ultra indicate relative positioning within a model family rather than absolute parameter counts. Mistral Small is 24B parameters. Phi-3 uses Mini for its 3.8B model. -Turbo typically signals speed optimization. -Pro, -Max, and -Ultra are more common in API product naming than in open-weight HuggingFace repos. Always check the actual parameter count.",
      "seen_in": [
        "model-name"
      ],
      "related": [
        "knowledge-distillation"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Mistral model family page",
          "url": "https://huggingface.co/mistralai"
        }
      ]
    },
    "tag-scout-maverick": {
      "id": "tag-scout-maverick",
      "name": "Scout / Maverick (Llama 4)",
      "expansion": "Llama 4 MoE scale variants",
      "category": "model-naming",
      "oneliner": "Scout: 17B active, 16 experts (109B total), 10M context. Maverick: 17B active, 128 experts (400B+ total), higher quality.",
      "explanation": "Scout and Maverick are the two variants of Meta's Llama 4 MoE family, released April 2025. Both use the same active parameter count but differ in total experts: Scout has 16 experts while Maverick has 128. The 16E/128E notation in the name tells you the expert count directly. More experts with the same active count means a larger total model requiring more memory but enabling more specialized routing per token.",
      "seen_in": [
        "model-name"
      ],
      "related": [
        "expert-routing",
        "moe",
        "token"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Llama 4 Scout model card",
          "url": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct"
        }
      ]
    },
    "tag-vision": {
      "id": "tag-vision",
      "name": "-V / -VL / -Vision",
      "expansion": "Vision-language multimodal",
      "category": "model-naming",
      "oneliner": "Accepts image (and sometimes video) inputs. -VL (Qwen), -Vision (Llama). Has an added vision encoder (often ViT) + projection layers.",
      "explanation": "A -Vision or -VL tag indicates a multimodal model that accepts both text and image inputs. Qwen2.5-VL-7B, Llama-3.2-11B-Vision, and InternVL are examples. These are architecturally distinct from text-only variants because they include a vision encoder (often a ViT) plus a projection layer that maps image patches into the token embedding space. Parameter counts usually include the vision encoder.",
      "seen_in": [
        "model-name"
      ],
      "related": [
        "mha",
        "token"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Llama 3.2 Vision model card",
          "url": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct"
        }
      ]
    },
    "temperature": {
      "id": "temperature",
      "name": "Temperature",
      "expansion": "Temperature (Sampling Temperature)",
      "category": "sampling-decoding",
      "oneliner": "A parameter that controls randomness in token selection — low temperature makes output deterministic, high temperature makes it creative and diverse.",
      "explanation": "Temperature is a scaling factor applied to the model's output logits before the softmax that converts them into probabilities. At temperature 1.0, the original probability distribution is used. Lower values (0.1-0.5) sharpen the distribution, making the model more likely to pick the highest-probability token. Higher values (1.0-2.0) flatten it, giving lower-probability tokens a better chance. Temperature 0 is equivalent to greedy decoding — always picking the top token.",
      "related": [
        "top-p",
        "top-k",
        "greedy-decoding",
        "inference"
      ],
      "seen_in": [
        "model-config",
        "code",
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "HuggingFace — Decoding strategies",
          "url": "https://huggingface.co/blog/mlabonne/decoding-strategies"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "test-time-compute": {
      "id": "test-time-compute",
      "name": "Test-Time Compute Scaling",
      "expansion": "Test-Time Compute Scaling",
      "category": "scaling-patterns",
      "oneliner": "Spending more compute at inference rather than training — letting the model 'think longer' on hard problems to improve accuracy without retraining.",
      "explanation": "Test-time compute scaling is the principle that spending more computation at inference can improve accuracy more effectively than scaling model parameters. You let a smaller model generate more reasoning tokens, sample multiple solutions, or search over candidates. Snell et al. showed a compute-optimal strategy can match a 14x larger model. This underpins reasoning models: o1 and R1 adaptively spend more tokens on harder problems, making cost proportional to difficulty.",
      "fundamentals": "Two main mechanisms: (1) search against verifiers — generate $k$ candidates, score with a Process Reward Model $r(z_i)$, select best; (2) sequential revision — model iteratively refines its own answer. Compute-optimal allocation: for easy prompts, revision dominates; for hard prompts, parallel search dominates. FLOPs budget: $C_{\\text{inference}} = k \\cdot T \\cdot c_{\\text{fwd}}$ where $k$ = samples, $T$ = tokens per sample, $c_{\\text{fwd}}$ = cost per forward token. Diminishing returns above ~64 samples for most tasks. Configurable effort levels (o3: low/medium/high) expose this tradeoff to users.",
      "related": [
        "reasoning-models",
        "chain-of-thought",
        "thinking-tokens",
        "inference"
      ],
      "seen_in": [
        "API-docs",
        "benchmarks",
        "research-papers"
      ],
      "foundational_papers": [
        {
          "title": "Scaling LLM Test-Time Compute Optimally can be More Effective than Scaling Model Parameters",
          "authors": "Snell, Lee, Xu, Kumar",
          "venue": "ICLR 2025 (Oral)",
          "arxiv": "2408.03314"
        },
        {
          "title": "The Art of Scaling Test-Time Compute for Large Language Models",
          "authors": "Zhang et al.",
          "venue": "2025",
          "arxiv": "2512.02008"
        }
      ],
      "resources": [
        {
          "label": "Snell et al. test-time compute paper",
          "url": "https://arxiv.org/abs/2408.03314"
        },
        {
          "label": "Awesome Inference-Time Scaling (paper list)",
          "url": "https://github.com/ThreeSR/Awesome-Inference-Time-Scaling"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "thinking-tokens": {
      "id": "thinking-tokens",
      "name": "Thinking Tokens",
      "expansion": "Thinking Tokens / Extended Thinking",
      "category": "scaling-patterns",
      "oneliner": "Hidden reasoning tokens generated before the visible answer — they consume context window and billing but are discarded from the final output shown to users.",
      "explanation": "Thinking tokens are intermediate reasoning tokens that models generate before producing a visible answer. In OpenAI o1 and o3, they are hidden and only a summary is shown; in DeepSeek R1, the full trace appears between think tags. They occupy context window space and are billed as output tokens, ranging from hundreds for easy queries to tens of thousands for hard ones. This variable-length generation makes reasoning models adaptive, spending more compute on harder problems.",
      "fundamentals": "Generation pattern: $[\\texttt{<think>}\\; z_1, z_2, \\ldots, z_T\\; \\texttt{</think>}\\; y]$. Context budget: $C_{\\text{ctx}} = |\\text{input}| + |\\text{thinking}| + |\\text{output}|$ must fit model window (e.g., 128K for o1, 128K for R1). API billing: thinking tokens count as output tokens. OpenAI hides raw thinking tokens, shows model-generated summary. Anthropic's Claude extended thinking exposes the full trace. Configurable thinking budgets: o3 low/medium/high, Claude \\texttt{budget\\_tokens} parameter. R1 uses \\texttt{<think>}...\\texttt{</think>} delimiters in the actual generation.",
      "related": [
        "reasoning-models",
        "chain-of-thought",
        "test-time-compute",
        "kv-cache"
      ],
      "seen_in": [
        "API-docs",
        "model-cards",
        "billing-docs"
      ],
      "foundational_papers": [
        {
          "title": "Learning to reason with LLMs (OpenAI o1 system card)",
          "authors": "OpenAI",
          "venue": "2024",
          "arxiv": ""
        },
        {
          "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
          "authors": "DeepSeek-AI",
          "venue": "2025",
          "arxiv": "2501.12948"
        }
      ],
      "resources": [
        {
          "label": "OpenAI reasoning tokens documentation",
          "url": "https://platform.openai.com/docs/guides/reasoning"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "token": {
      "id": "token",
      "name": "Token",
      "expansion": "Token (subword unit)",
      "category": "quantization-basics",
      "oneliner": "The smallest text unit an LLM reads and generates — typically a subword piece, not a full word.",
      "explanation": "A token is the atomic unit of text that a language model processes. A tokenizer splits input text into subword pieces drawn from a fixed vocabulary, so the word 'unbelievable' might become three tokens like 'un', 'believ', 'able'. Vocabulary sizes vary: LLaMA uses 32,000 tokens while GPT-4 uses around 100,000. Larger vocabularies compress text more efficiently but increase the model's embedding table size.",
      "fundamentals": "Byte-Pair Encoding starts with a character-level vocabulary and iteratively merges the most frequent adjacent pair into a new token. After $k$ merges the vocabulary has $|V_0| + k$ entries. The compression ratio — characters per token — typically lands between 3 and 5 for English. The final softmax over the vocabulary has cost $O(|V| \\cdot d_{\\text{model}})$, so doubling $|V|$ roughly doubles that layer's compute.",
      "related": [
        "tokenizer-bpe",
        "tokenizer-spm"
      ],
      "seen_in": [
        "model-config",
        "code",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Neural Machine Translation of Rare Words with Subword Units",
          "authors": "Sennrich et al.",
          "venue": "ACL 2016",
          "arxiv": "1508.07909"
        }
      ],
      "resources": [
        {
          "label": "OpenAI Tokenizer Tool",
          "url": "https://platform.openai.com/tokenizer"
        },
        {
          "label": "Andrej Karpathy — minbpe tokenizer",
          "url": "https://github.com/karpathy/minbpe"
        },
        {
          "label": "3Blue1Brown — Transformers",
          "url": "https://www.3blue1brown.com/lessons/gpt"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "tokenizer-bpe": {
      "id": "tokenizer-bpe",
      "name": "BPE",
      "expansion": "Byte-Pair Encoding",
      "category": "tokenizers",
      "oneliner": "Dominant subword tokenization. Iteratively merge most frequent adjacent pairs. Byte-level BPE (GPT-2+) handles any text with no unknown tokens.",
      "explanation": "Byte-Pair Encoding is the dominant tokenization algorithm for modern LLMs. It starts with a character or byte-level vocabulary and iteratively merges the most frequent adjacent pair into a new token, repeating until the desired vocabulary size is reached. Byte-level BPE, introduced with GPT-2, can encode any text without unknown tokens. The resulting vocabulary balances compression efficiency with handling rare and novel words. Used by GPT, Llama, and Mistral.",
      "seen_in": [
        "documentation",
        "tokenizer-config"
      ],
      "related": [
        "token",
        "tokenizer-spm",
        "tokenizer-tiktoken"
      ],
      "foundational_papers": [
        {
          "title": "Neural Machine Translation of Rare Words with Subword Units",
          "authors": "Sennrich et al.",
          "venue": "ACL 2016",
          "arxiv": "1508.07909"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-10"
    },
    "tokenizer-spm": {
      "id": "tokenizer-spm",
      "name": "SentencePiece (SPM)",
      "expansion": "Google's language-independent tokenizer",
      "category": "tokenizers",
      "oneliner": "Treats text as raw Unicode stream (no pre-tokenization). Two modes: BPE and Unigram. Outputs .model/.vocab files. Used by Llama 1/2 (BPE), T5 (Unigram).",
      "seen_in": [
        "filename",
        "tokenizer-config"
      ],
      "related": [
        "tokenizer-bpe"
      ],
      "foundational_papers": [
        {
          "title": "SentencePiece: A simple and language independent subword tokenizer",
          "authors": "Kudo & Richardson",
          "venue": "EMNLP 2018",
          "arxiv": "1808.06226"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "explanation": "SentencePiece is a language-independent tokenizer that operates directly on raw Unicode text without requiring pre-tokenization or whitespace splitting. It supports two subword algorithms: Byte-Pair Encoding and Unigram Language Model. Llama 1 and Llama 2 use its BPE mode, while T5 uses Unigram. Because it treats input as a raw byte stream, it handles any language or script without special preprocessing rules."
    },
    "tokenizer-tiktoken": {
      "id": "tokenizer-tiktoken",
      "name": "tiktoken",
      "expansion": "OpenAI's fast BPE tokenizer (Rust + Python)",
      "category": "tokenizers",
      "oneliner": "Significantly faster than HuggingFace tokenizers. Pre-compiled merge tables (cl100k_base for GPT-3.5/4, o200k_base for GPT-4o). Also used by Qwen.",
      "seen_in": [
        "code",
        "tokenizer-config"
      ],
      "related": [
        "tokenizer-bpe"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "tiktoken GitHub",
          "url": "https://github.com/openai/tiktoken"
        }
      ],
      "explanation": "Tiktoken is OpenAI's open-source tokenizer library, written in Rust with Python bindings, making it significantly faster than alternatives like HuggingFace tokenizers. It uses pre-compiled BPE merge tables rather than learning them at runtime. Tiktoken powers the tokenization for GPT-3.5, GPT-4, and the Qwen model family. Its speed advantage matters most in applications that need to count or split tokens at high throughput."
    },
    "tokenizer-wordpiece": {
      "id": "tokenizer-wordpiece",
      "name": "WordPiece",
      "expansion": "BERT's subword tokenizer",
      "category": "tokenizers",
      "oneliner": "Like BPE but merges by maximum likelihood (mutual information) not frequency. Continuation tokens prefixed with ##. Used by BERT, DistilBERT, Electra.",
      "seen_in": [
        "tokenizer-config"
      ],
      "related": [
        "tokenizer-bpe"
      ],
      "foundational_papers": [
        {
          "title": "BERT: Pre-training of Deep Bidirectional Transformers",
          "authors": "Devlin et al.",
          "venue": "NAACL 2019",
          "arxiv": "1810.04805"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "explanation": "WordPiece is a subword tokenization algorithm that selects merges by maximizing the likelihood of the training corpus rather than simply picking the most frequent pair, as BPE does. Continuation tokens are marked with a ## prefix, so 'playing' might become 'play' and '##ing'. It is the tokenizer behind BERT, DistilBERT, and most models in the original Transformer encoder family from Google."
    },
    "tool-accelerate": {
      "id": "tool-accelerate",
      "name": "Accelerate",
      "expansion": "HuggingFace Accelerate — Distributed Training Library",
      "category": "serving-tools",
      "oneliner": "HuggingFace's library that makes PyTorch training code run on any hardware setup — single GPU, multi-GPU, or TPU — with minimal code changes.",
      "explanation": "Accelerate is a HuggingFace library that abstracts away distributed training complexity. You write standard PyTorch code, and Accelerate distributes it across multiple GPUs, TPU pods, or mixed-precision setups with minimal wrapper calls. It supports DeepSpeed ZeRO, FSDP, and gradient accumulation. For LLM fine-tuning, Accelerate is the layer between your training script and the hardware, used internally by HuggingFace Trainer and TRL.",
      "fundamentals": "Core pattern: wrap model, optimizer, and dataloader with `accelerator.prepare()`, replace `loss.backward()` with `accelerator.backward(loss)`. Accelerate auto-detects available hardware and configures the appropriate distributed backend (DDP, FSDP, DeepSpeed). Configuration via `accelerate config` CLI or YAML. Supports mixed precision (fp16, bf16, fp8), gradient checkpointing, and model sharding.",
      "related": [
        "zero",
        "full-ft",
        "lora",
        "qlora"
      ],
      "seen_in": [
        "code",
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "Accelerate docs",
          "url": "https://huggingface.co/docs/accelerate"
        },
        {
          "label": "Accelerate GitHub",
          "url": "https://github.com/huggingface/accelerate"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "tool-crewai": {
      "id": "tool-crewai",
      "name": "CrewAI",
      "expansion": "CrewAI — multi-agent orchestration framework",
      "category": "serving-tools",
      "oneliner": "Python framework for orchestrating teams of role-playing AI agents. Each agent has a role, goal, and tools; tasks flow through a structured crew pipeline.",
      "explanation": "CrewAI is an open-source Python framework for building multi-agent systems where each agent is assigned a specific role, goal, and backstory. Agents collaborate by passing task outputs along a defined workflow, with support for sequential, parallel, and hierarchical execution. Unlike LangChain which is a general-purpose LLM framework, CrewAI is purpose-built for the multi-agent pattern: define agents, assign tasks with dependencies, and let the crew execute end-to-end.",
      "fundamentals": "Core concepts: Agent (role + goal + backstory + tools), Task (description + expected output + assigned agent), Crew (collection of agents + tasks + process type). Process types: sequential (tasks run in order), hierarchical (a manager agent delegates to workers and aggregates results). Agents can use any tool that follows the BaseTool interface. Memory: short-term (within a crew run), long-term (across runs via embeddings), and entity memory. CrewAI Flows add event-driven orchestration on top of crews for complex multi-crew pipelines. Enterprise edition adds observability, deployment, and access control.",
      "related": [
        "agentic-ai",
        "function-calling",
        "tool-langchain"
      ],
      "seen_in": [
        "documentation",
        "code"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "CrewAI website",
          "url": "https://crewai.com/"
        },
        {
          "label": "CrewAI docs",
          "url": "https://docs.crewai.com/"
        },
        {
          "label": "CrewAI GitHub",
          "url": "https://github.com/crewAIInc/crewAI"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "tool-dask": {
      "id": "tool-dask",
      "name": "Dask",
      "expansion": "Dask — Parallel Computing Library",
      "category": "serving-tools",
      "oneliner": "A Python parallel computing library that scales pandas, NumPy, and scikit-learn workflows to clusters, used for large-scale data preprocessing in LLM pipelines.",
      "explanation": "Dask is a parallel computing library that scales pandas, NumPy, and scikit-learn workflows to datasets larger than memory and across clusters. In LLM pipelines, Dask is primarily used for data preprocessing — cleaning trillion-token training corpora, computing dataset statistics, and managing large-scale tokenization. While not directly involved in model training or inference, Dask handles the data engineering that feeds LLM training.",
      "fundamentals": "Dask represents computations as directed acyclic graphs (DAGs) of tasks. A Dask DataFrame partitions data into pandas DataFrames across workers. Lazy evaluation: operations build the graph without executing until `.compute()` is called. Schedulers: single-machine threaded/multiprocess, or distributed across a cluster via `dask.distributed`. Integration with cloud storage (S3, GCS) makes it practical for TB-scale training data.",
      "related": [
        "pre-training",
        "tool-ray"
      ],
      "seen_in": [
        "code",
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "Dask docs",
          "url": "https://docs.dask.org"
        },
        {
          "label": "Dask GitHub",
          "url": "https://github.com/dask/dask"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "tool-exllamav2": {
      "id": "tool-exllamav2",
      "name": "ExLlamaV2",
      "expansion": "High-performance CUDA inference engine",
      "category": "serving-tools",
      "oneliner": "By turboderp. EXL2 format with variable per-layer bitwidth. Very fast on NVIDIA consumer GPUs. No CPU support.",
      "seen_in": [
        "serving-config"
      ],
      "related": [
        "exl2",
        "inference",
        "inference-engine",
        "org-turboderp",
        "quantization",
        "token"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "ExLlamaV2 GitHub",
          "url": "https://github.com/turboderp/exllamav2"
        }
      ],
      "explanation": "ExLlamaV2 is a CUDA-optimized inference engine built by turboderp, focused on maximum speed on NVIDIA consumer GPUs like the RTX 3090 and 4090. It introduced the EXL2 quantization format, which assigns different bit widths to different layers based on their sensitivity, squeezing more quality out of a given memory budget. Benchmarks often show it delivering the fastest single-GPU token generation for models that fit in VRAM."
    },
    "tool-langchain": {
      "id": "tool-langchain",
      "name": "LangChain",
      "expansion": "LangChain — LLM application framework",
      "category": "serving-tools",
      "oneliner": "The most widely adopted Python/JS framework for building LLM-powered applications. Chains, agents, RAG pipelines, and LangGraph for stateful agent orchestration.",
      "explanation": "LangChain is an open-source framework for building applications powered by language models. It provides composable abstractions for prompts, models, output parsers, retrievers, and tool calling, letting developers wire together chains (sequential LLM calls) and agents (LLM-driven tool-use loops). LangGraph, its companion library, models agent workflows as state machines (graphs) with durable state, human-in-the-loop checkpoints, and long-running execution. LangChain 1.0, released in late 2025, consolidated the framework around these core abstractions with LangGraph as the recommended agent runtime.",
      "fundamentals": "Core abstractions: ChatModel (provider-agnostic LLM interface), PromptTemplate, OutputParser, Retriever, Tool. A Chain is a fixed sequence of steps; an Agent uses the LLM to decide which tools to call dynamically. LangGraph extends this with nodes (functions), edges (control flow), and state objects that persist across steps. LangSmith provides observability (tracing, evaluation). The ecosystem includes 700+ integrations via langchain-community. Criticism: early versions were over-abstracted; v0.3+ and LangGraph addressed this by favouring explicit graphs over magic chains.",
      "related": [
        "agentic-ai",
        "function-calling",
        "rag",
        "tool-llamaindex"
      ],
      "seen_in": [
        "documentation",
        "code"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "LangChain docs",
          "url": "https://python.langchain.com/docs/"
        },
        {
          "label": "LangGraph docs",
          "url": "https://langchain-ai.github.io/langgraph/"
        },
        {
          "label": "LangChain GitHub",
          "url": "https://github.com/langchain-ai/langchain"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "tool-langfuse": {
      "id": "tool-langfuse",
      "name": "LangFuse",
      "expansion": "LangFuse — Open-Source LLM Observability",
      "category": "serving-tools",
      "oneliner": "An open-source alternative to LangSmith for tracing LLM calls, managing prompts, running evaluations, and tracking costs — self-hostable and framework-agnostic.",
      "explanation": "LangFuse is an open-source LLM observability platform that provides tracing, prompt management, evaluation, and cost tracking for LLM applications. It captures the full execution trace of every request, including nested LLM calls, retrieval steps, and tool invocations. LangFuse is framework-agnostic with integrations for LangChain, LlamaIndex, OpenAI SDK, and others. Being open-source and self-hostable, it appeals to teams that need data privacy or want to avoid vendor lock-in.",
      "related": [
        "llmops",
        "observability",
        "evals",
        "tool-langsmith"
      ],
      "seen_in": [
        "documentation",
        "code"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "LangFuse GitHub",
          "url": "https://github.com/langfuse/langfuse"
        },
        {
          "label": "LangFuse docs",
          "url": "https://langfuse.com/docs"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "tool-langsmith": {
      "id": "tool-langsmith",
      "name": "LangSmith",
      "expansion": "LangSmith — LLM Observability Platform",
      "category": "serving-tools",
      "oneliner": "LangChain's platform for tracing, evaluating, and monitoring LLM application calls in production — the most adopted LLMOps tool.",
      "explanation": "LangSmith is an observability and evaluation platform from LangChain for managing LLM applications in production. It traces every LLM call, tool invocation, and retrieval step with full input/output visibility. Developers use it to debug agent failures by replaying exact traces, run automated evaluations against prompt changes, manage prompt versions, and track cost and latency. LangSmith works with any LLM provider (not just LangChain) via a lightweight SDK.",
      "related": [
        "llmops",
        "observability",
        "tool-langchain",
        "evals"
      ],
      "seen_in": [
        "documentation",
        "code"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "LangSmith docs",
          "url": "https://docs.smith.langchain.com"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "tool-llamacpp": {
      "id": "tool-llamacpp",
      "name": "llama.cpp",
      "expansion": "C/C++ inference engine by Georgi Gerganov",
      "category": "serving-tools",
      "oneliner": "Runs LLMs on CPU (+ GPU offload). Defines GGUF format. Broad hardware support (x86, ARM, CUDA, Metal, Vulkan). Foundation of local LLM ecosystem.",
      "seen_in": [
        "documentation"
      ],
      "related": [
        "gguf",
        "inference",
        "inference-engine",
        "iq-quants",
        "k-quants"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "llama.cpp GitHub",
          "url": "https://github.com/ggerganov/llama.cpp"
        }
      ],
      "explanation": "llama.cpp is a C/C++ inference engine created by Georgi Gerganov that makes it possible to run large language models on consumer hardware, including CPUs. It defined the GGUF file format, which has become the standard for quantized model distribution. Users can offload selected layers to a GPU for extra speed while keeping the rest in system RAM. Its efficiency and portability spawned an entire ecosystem of local-inference tools built on top of it."
    },
    "tool-llamaindex": {
      "id": "tool-llamaindex",
      "name": "LlamaIndex",
      "expansion": "LlamaIndex — data framework for LLM applications",
      "category": "serving-tools",
      "oneliner": "Python framework optimised for connecting LLMs to your data. Best-in-class RAG pipelines, document agents, and structured data querying out of the box.",
      "explanation": "LlamaIndex is an open-source data framework that simplifies building retrieval-augmented generation and agentic applications over private data. It provides connectors for 160+ data sources, handles chunking, indexing, and embedding, and orchestrates retrieval and synthesis at query time. Its Workflows API lets developers build multi-step agentic pipelines with branching, loops, and parallel execution.",
      "fundamentals": "Pipeline: DataConnector (loads docs) -> NodeParser (chunks) -> Embedding model -> VectorStoreIndex (stores embeddings) -> Retriever (fetches top-k) -> ResponseSynthesizer (LLM generates answer). Advanced features: agentic RAG (per-document agents with a top-level router), sub-question decomposition, knowledge graph indices, and structured output extraction. Workflows API replaces the older query-engine pattern with explicit step functions, state passing, and event-driven control flow. LlamaParse handles complex PDF/table extraction. LlamaCloud offers managed indexing and retrieval.",
      "related": [
        "agentic-ai",
        "rag",
        "tool-langchain",
        "function-calling"
      ],
      "seen_in": [
        "documentation",
        "code"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "LlamaIndex docs",
          "url": "https://docs.llamaindex.ai/"
        },
        {
          "label": "LlamaIndex GitHub",
          "url": "https://github.com/run-llama/llama_index"
        },
        {
          "label": "LlamaIndex website",
          "url": "https://www.llamaindex.ai/"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "tool-lmstudio": {
      "id": "tool-lmstudio",
      "name": "LM Studio",
      "expansion": "Desktop GUI for local LLMs",
      "category": "serving-tools",
      "oneliner": "GUI app (Win/Mac/Linux) for discovering, downloading, running local models. Built on llama.cpp + MLX. Model search, chat UI, local server.",
      "seen_in": [
        "documentation"
      ],
      "related": [
        "tool-llamacpp"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "LM Studio website",
          "url": "https://lmstudio.ai"
        }
      ],
      "explanation": "LM Studio is a desktop GUI application that lets users browse, download, and run open-weight language models locally without touching a terminal. Under the hood it uses llama.cpp on Windows and Linux, and Apple MLX on Mac, choosing the fastest backend automatically. It provides a chat interface, a local OpenAI-compatible API server, and model search across HuggingFace. It has become one of the most popular ways non-developers interact with local LLMs."
    },
    "tool-n8n": {
      "id": "tool-n8n",
      "name": "n8n",
      "expansion": "n8n — workflow automation platform",
      "category": "serving-tools",
      "oneliner": "Open-source, self-hostable workflow automation platform with a visual builder. Connects 400+ apps and now supports AI agent nodes with LLM tool calling.",
      "explanation": "n8n is an open-source workflow automation platform that lets users visually connect triggers, logic, and integrations across hundreds of services. Since 2024 it has added first-class AI agent nodes that embed LLM reasoning directly into workflows: an agent node can call tools (other n8n nodes), maintain conversational memory, and make multi-step decisions.",
      "fundamentals": "Architecture: event-driven DAG of nodes. Each node is a trigger, action, or logic gate. AI Agent node wraps an LLM (OpenAI, Anthropic, Gemini, Ollama) with a system prompt and a set of tool-nodes it can invoke. Execution is visual and inspectable: every prompt, model response, and tool result is logged per run. Supports self-hosting (Docker, Kubernetes) or n8n Cloud. Workflows can be versioned, shared, and triggered via webhooks, cron, or app events. Community library has 1000+ workflow templates including RAG pipelines, email classifiers, and multi-agent patterns.",
      "related": [
        "agentic-ai",
        "function-calling"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "n8n website",
          "url": "https://n8n.io/"
        },
        {
          "label": "n8n AI features",
          "url": "https://n8n.io/ai/"
        },
        {
          "label": "n8n GitHub",
          "url": "https://github.com/n8n-io/n8n"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "tool-ollama": {
      "id": "tool-ollama",
      "name": "Ollama",
      "expansion": "User-friendly llama.cpp wrapper",
      "category": "serving-tools",
      "oneliner": "'ollama run llama3' downloads and runs in one command. Local REST API, Modelfiles, background service. Simplest local LLM setup.",
      "seen_in": [
        "documentation"
      ],
      "related": [
        "adapters",
        "gguf",
        "tool-llamacpp"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Ollama GitHub",
          "url": "https://github.com/ollama/ollama"
        },
        {
          "label": "Ollama website",
          "url": "https://ollama.com"
        }
      ],
      "explanation": "Ollama is a command-line tool and background service that wraps llama.cpp into a seamless local experience. A single command like 'ollama run llama3' downloads the model weights and starts an interactive chat. It exposes a local REST API compatible with the OpenAI format, so apps can swap between local and cloud models easily. Modelfiles let users customize system prompts, parameters, and adapters in a Dockerfile-like syntax."
    },
    "tool-outlines": {
      "id": "tool-outlines",
      "name": "Outlines",
      "expansion": "Outlines — Grammar-Guided Generation Library",
      "category": "serving-tools",
      "oneliner": "A Python library that implements grammar-guided constrained decoding, guaranteeing LLM outputs conform to JSON schemas, regex patterns, or arbitrary grammars.",
      "explanation": "Outlines is a library from dottxt that forces LLM output to conform to a specified format by masking invalid tokens at each generation step. You provide a JSON Schema, Pydantic model, regex pattern, or context-free grammar, and Outlines computes which tokens are valid continuations at each position, setting all others to negative infinity before sampling. This guarantees structurally valid output without retries or post-processing. It integrates with vLLM, llama.cpp, and Transformers.",
      "related": [
        "structured-output",
        "pydantic-ai",
        "function-calling",
        "tool-vllm"
      ],
      "seen_in": [
        "code",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Efficient Guided Generation for Large Language Models",
          "authors": "Willard & Louf",
          "venue": "2023",
          "arxiv": "2307.09702"
        }
      ],
      "resources": [
        {
          "label": "Outlines GitHub",
          "url": "https://github.com/dottxt-ai/outlines"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "tool-ray": {
      "id": "tool-ray",
      "name": "Ray",
      "expansion": "Ray — Distributed Computing Framework",
      "category": "serving-tools",
      "oneliner": "A general-purpose distributed computing framework used to scale LLM training, serving, and data processing across clusters.",
      "explanation": "Ray is an open-source framework for scaling Python applications across clusters. In the LLM ecosystem, Ray Serve powers vLLM's multi-node deployment, Ray Train handles distributed fine-tuning, and Ray Data processes large training datasets. It provides a simple API where decorating a function distributes it across workers automatically. Many production LLM deployments use Ray for cluster orchestration under higher-level tools.",
      "fundamentals": "Ray uses an actor-based distributed execution model. Tasks (`@ray.remote` functions) and actors (`@ray.remote` classes) are scheduled across a cluster by a centralized GCS (Global Control Store). Ray Serve builds on this for model serving with autoscaling and request batching. Ray Train wraps PyTorch distributed training with fault tolerance and checkpoint management.",
      "related": [
        "tool-vllm",
        "zero",
        "tp-pp"
      ],
      "seen_in": [
        "code",
        "documentation",
        "serving-config"
      ],
      "foundational_papers": [
        {
          "title": "Ray: A Distributed Framework for Emerging AI Applications",
          "authors": "Moritz et al.",
          "venue": "OSDI 2018",
          "arxiv": "1712.05889"
        }
      ],
      "resources": [
        {
          "label": "Ray docs",
          "url": "https://docs.ray.io"
        },
        {
          "label": "Ray GitHub",
          "url": "https://github.com/ray-project/ray"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "tool-sglang": {
      "id": "tool-sglang",
      "name": "SGLang",
      "expansion": "Structured Generation Language",
      "category": "serving-tools",
      "oneliner": "Serving backend + frontend DSL. RadixAttention (KV cache reuse via radix tree), fast constrained decoding (JSON/regex). Competitive with vLLM throughput.",
      "seen_in": [
        "serving-config"
      ],
      "related": [
        "kv-cache",
        "tool-vllm"
      ],
      "foundational_papers": [
        {
          "title": "SGLang: Efficient Execution of Structured Language Model Programs",
          "authors": "Zheng et al.",
          "venue": "2024",
          "arxiv": "2312.07104"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "explanation": "SGLang is both a serving backend and a frontend programming language for LLM applications. Its key innovation, RadixAttention, reuses KV cache across requests that share common prefixes, which speeds up workloads like multi-turn chat and branching prompts. The frontend DSL lets developers express complex generation patterns in clean Python. It consistently ranks among the fastest open-source serving engines in benchmarks."
    },
    "tool-tensorrt-llm": {
      "id": "tool-tensorrt-llm",
      "name": "TensorRT-LLM",
      "expansion": "NVIDIA's optimized LLM inference",
      "category": "serving-tools",
      "oneliner": "Compiles models into optimized TensorRT engines. FP8, INT4 AWQ/GPTQ, SmoothQuant, paged KV cache, multi-GPU. Maximum performance on NVIDIA datacenter GPUs.",
      "seen_in": [
        "serving-config"
      ],
      "related": [
        "fp8",
        "int4",
        "quantization",
        "w8a8"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "TensorRT-LLM GitHub",
          "url": "https://github.com/NVIDIA/TensorRT-LLM"
        }
      ],
      "explanation": "TensorRT-LLM is NVIDIA's official library for optimizing and serving large language models on NVIDIA GPUs. It compiles models into highly optimized TensorRT engines using techniques like FP8 and INT4 quantization, paged KV caches, and in-flight batching. It supports multi-GPU and multi-node setups via tensor and pipeline parallelism. Deploying requires an ahead-of-time compilation step, but the resulting throughput on NVIDIA hardware is typically best-in-class."
    },
    "tool-tgi": {
      "id": "tool-tgi",
      "name": "TGI",
      "expansion": "Text Generation Inference (HuggingFace)",
      "category": "serving-tools",
      "oneliner": "HuggingFace's production server. Rust router + Python model serving. Continuous batching, Flash Attention, grammar-constrained generation. Docker-first.",
      "seen_in": [
        "serving-config"
      ],
      "related": [
        "continuous-batching",
        "inference",
        "quantization",
        "token",
        "tool-vllm"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "TGI GitHub",
          "url": "https://github.com/huggingface/text-generation-inference"
        }
      ],
      "explanation": "Text Generation Inference (TGI) is HuggingFace's production-grade serving solution for large language models. It pairs a high-performance Rust-based HTTP router with Python model backends, handling continuous batching, token streaming, and quantization out of the box. TGI integrates tightly with the HuggingFace Hub, making it straightforward to deploy any compatible model. It powers the Inference API behind many HuggingFace-hosted endpoints."
    },
    "tool-triton": {
      "id": "tool-triton",
      "name": "Triton (OpenAI)",
      "expansion": "Triton — Python-based GPU Kernel Language",
      "category": "serving-tools",
      "oneliner": "A Python language for writing custom GPU kernels without CUDA, used to build high-performance attention and quantization kernels for LLMs.",
      "explanation": "Triton is an open-source programming language from OpenAI that lets developers write GPU kernels in Python instead of CUDA C++. It handles memory management, thread scheduling, and hardware optimizations automatically. Many key LLM kernels are written in Triton, including Flash Attention implementations and custom quantization kernels in vLLM. It compiles to optimized GPU code for NVIDIA and AMD hardware.",
      "fundamentals": "Triton programs use a block-based programming model: each kernel instance processes a tile of data specified by `tl.program_id` and block sizes. The compiler auto-tunes block dimensions, memory coalescing, and shared memory usage. Key operations: `tl.load`, `tl.store`, `tl.dot` (matrix multiply), `tl.sum`. A Flash Attention kernel in Triton is ~100 lines vs ~1000 in CUDA.",
      "related": [
        "flash-attention",
        "marlin",
        "tool-vllm"
      ],
      "seen_in": [
        "code",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "Triton: An Intermediate Language and Compiler for Tiled Neural Network Computations",
          "authors": "Tillet et al.",
          "venue": "MAPL 2019",
          "arxiv": "1907.00587"
        }
      ],
      "resources": [
        {
          "label": "Triton GitHub",
          "url": "https://github.com/triton-lang/triton"
        },
        {
          "label": "Triton tutorials",
          "url": "https://triton-lang.org/main/getting-started/tutorials/index.html"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "tool-vllm": {
      "id": "tool-vllm",
      "name": "vLLM",
      "expansion": "vLLM — high-throughput LLM serving",
      "category": "serving-tools",
      "oneliner": "PagedAttention + continuous batching. The default open-source production serving engine. Supports TP, GPTQ/AWQ/FP8, speculative decoding.",
      "seen_in": [
        "serving-config",
        "documentation"
      ],
      "related": [
        "continuous-batching",
        "inference",
        "inference-engine",
        "paged-attention"
      ],
      "foundational_papers": [
        {
          "title": "Efficient Memory Management for LLM Serving with PagedAttention",
          "authors": "Kwon et al.",
          "venue": "SOSP 2023",
          "arxiv": "2309.06180"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "explanation": "vLLM is an open-source inference engine for serving large language models in production. It introduced PagedAttention, which manages GPU memory for key-value caches the way an operating system pages virtual memory, dramatically reducing waste. Combined with continuous batching, it achieves throughput several times higher than naive HuggingFace generation. vLLM has become the default serving backend for most open-weight deployments and supports tensor parallelism across multiple GPUs."
    },
    "top-k": {
      "id": "top-k",
      "name": "Top-k Sampling",
      "expansion": "Top-k Sampling",
      "category": "sampling-decoding",
      "oneliner": "A decoding strategy that restricts token selection to the k highest-probability tokens at each step, preventing low-probability nonsense tokens from being chosen.",
      "explanation": "Top-k sampling limits the candidate pool at each generation step to the k tokens with the highest probabilities, then samples from that reduced set. This prevents the model from occasionally picking extremely unlikely tokens that produce incoherent text. Common values are k=40 to k=100. The limitation is that k is fixed regardless of context — a confident prediction and an uncertain one both consider the same number of candidates.",
      "related": [
        "temperature",
        "top-p",
        "greedy-decoding"
      ],
      "seen_in": [
        "model-config",
        "code"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "HuggingFace — Decoding strategies",
          "url": "https://huggingface.co/blog/mlabonne/decoding-strategies"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "top-p": {
      "id": "top-p",
      "name": "Top-p / Nucleus Sampling",
      "expansion": "Top-p (Nucleus) Sampling",
      "category": "sampling-decoding",
      "oneliner": "A decoding strategy that samples from the smallest set of tokens whose cumulative probability exceeds a threshold p, adapting vocabulary size dynamically.",
      "explanation": "Top-p sampling, also called nucleus sampling, selects the smallest set of tokens whose cumulative probability mass exceeds a threshold p (typically 0.9-0.95). Unlike top-k which always considers a fixed number of tokens regardless of confidence, top-p adapts: when the model is confident, it considers few tokens; when uncertain, it considers many. This produces more natural and diverse text than top-k alone. Most LLM APIs use top-p as the default sampling strategy alongside temperature.",
      "related": [
        "temperature",
        "top-k",
        "greedy-decoding"
      ],
      "seen_in": [
        "model-config",
        "code"
      ],
      "foundational_papers": [
        {
          "title": "The Curious Case of Neural Text Degeneration",
          "authors": "Holtzman et al.",
          "venue": "ICLR 2020",
          "arxiv": "1904.09751"
        }
      ],
      "resources": [
        {
          "label": "HuggingFace — Decoding strategies",
          "url": "https://huggingface.co/blog/mlabonne/decoding-strategies"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "tp-pp": {
      "id": "tp-pp",
      "name": "Tensor Parallelism (TP) vs Pipeline Parallelism (PP)",
      "expansion": "TP = Tensor (Intra-Layer) Parallelism; PP = Pipeline (Inter-Layer) Parallelism",
      "category": "scaling-patterns",
      "oneliner": "TP splits each layer across GPUs (every GPU computes a slice of every layer). PP assigns different layers to different GPUs (sequential pipeline). TP within nodes, PP across nodes.",
      "explanation": "Tensor parallelism and pipeline parallelism are two strategies for distributing a model across multiple GPUs. Tensor parallelism splits each layer's weight matrices across GPUs so every GPU computes a slice of every layer, requiring fast interconnects within a node. Pipeline parallelism assigns entire layers to different GPUs in sequence, needing less bandwidth but suffering idle time between pipeline stages. Large-scale training typically combines both.",
      "fundamentals": "TP: W split column-wise into [$W_1$|...|$W_p$]. GPU_i computes Y_i = X·W_i. AllGather to reconstruct or AllReduce to sum. 2 AllReduces per layer, each ~2M bytes for hidden_dim M. PP: micro-batch pipeline. Bubble = (P-1)/(M+P-1). For P=8, M=32: 18% bubble. Interleaved schedules: non-contiguous layer assignment reduces bubbles by factor v. Inference: prefer TP (minimizes latency — all GPUs work on every token). Training: 3D parallelism.",
      "seen_in": [
        "serving-config",
        "training-config"
      ],
      "related": [
        "inference",
        "token"
      ],
      "sources": [
        "Narayanan et al., 'Megatron-LM,' arXiv:1909.08053, arXiv:2104.04473",
        "Huang et al., 'GPipe,' NeurIPS 2019, arXiv:1811.06965"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM",
          "authors": "Narayanan et al.",
          "venue": "2021",
          "arxiv": "2104.04473"
        },
        {
          "title": "GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism",
          "authors": "Huang et al.",
          "venue": "NeurIPS 2019",
          "arxiv": "1811.06965"
        }
      ]
    },
    "training-recipe": {
      "id": "training-recipe",
      "name": "Training Recipe",
      "expansion": "LLM Training Recipe (End-to-End Reproduction Guide)",
      "category": "training-pipeline",
      "oneliner": "The complete specification for reproducing a foundation model — data mix, tokenizer, hyperparameters, learning rate schedule, and infrastructure setup.",
      "explanation": "A training recipe is the full set of decisions and configurations needed to reproduce a language model from scratch. It covers data sourcing and filtering, tokenizer training, model architecture choices, hyperparameters, learning rate schedules, batch size ramp-up, hardware configuration, and checkpoint management. AI2's OLMo project pioneered fully open training recipes, publishing not just weights but the complete training code, data pipelines, and intermediate checkpoints.",
      "fundamentals": "Key recipe components: data mix (web text ratios, code, books, domain-specific), data cleaning (deduplication, quality filtering, toxicity removal), tokenizer (BPE vocabulary size, special tokens), architecture (layers, heads, hidden dim, GQA ratio), optimizer (AdamW, weight decay, betas), schedule (warmup steps, cosine decay, final LR), batch ramp (start small, increase), context length (may increase during training), infrastructure (GPU type, parallelism strategy, checkpointing frequency).",
      "related": [
        "pre-training",
        "scaling-laws",
        "mixed-precision-training",
        "zero"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "OLMo: Accelerating the Science of Language Models",
          "authors": "Groeneveld et al.",
          "venue": "ACL 2024",
          "arxiv": "2402.00838"
        }
      ],
      "resources": [
        {
          "label": "OLMo GitHub",
          "url": "https://github.com/allenai/OLMo"
        },
        {
          "label": "Llama 3 training details",
          "url": "https://arxiv.org/abs/2407.21783"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "transfer-learning": {
      "id": "transfer-learning",
      "name": "Transfer Learning",
      "expansion": "Transfer Learning",
      "category": "training-pipeline",
      "oneliner": "Reusing a model pretrained on a large general corpus as the starting point for a new, often smaller, downstream task.",
      "explanation": "Transfer learning is the practice of reusing a model trained on a broad dataset as the starting point for a specific task, rather than training from scratch. The pretrained weights already encode language structure and world knowledge, so the new task needs far less data and compute. This idea underlies all modern fine-tuning methods including full fine-tuning, LoRA, and QLoRA. Even zero-shot prompting is a form of transfer learning.",
      "fundamentals": "Transfer assumes features learned on $P_{\\text{source}}(x)$ are useful for $P_{\\text{target}}(x)$. Fine-tuning updates $\\theta_{\\text{pretrained}} \\to \\theta + \\Delta\\theta$ using a task-specific loss. PEFT methods constrain $\\Delta\\theta$: LoRA uses $\\Delta W = BA$ with $B \\in \\mathbb{R}^{d \\times r}$, $r \\ll d$, reducing trainable params from $O(d^2)$ to $O(rd)$.",
      "related": [
        "full-ft",
        "lora",
        "peft",
        "pre-training",
        "qlora",
        "sft"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "BERT: Pre-training of Deep Bidirectional Transformers",
          "authors": "Devlin et al.",
          "venue": "NAACL 2019",
          "arxiv": "1810.04805"
        }
      ],
      "resources": [
        {
          "label": "Sebastian Ruder — Transfer Learning in NLP",
          "url": "https://ruder.io/transfer-learning/"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "truthfulqa": {
      "id": "truthfulqa",
      "name": "TruthfulQA",
      "expansion": "TruthfulQA",
      "category": "scaling-patterns",
      "oneliner": "A benchmark testing whether LLMs generate truthful answers rather than repeating popular misconceptions — specifically targeting common human false beliefs.",
      "explanation": "TruthfulQA is a benchmark of 817 questions designed to test whether language models reproduce common misconceptions or generate truthful answers. Questions span health, law, finance, and politics where humans commonly hold false beliefs. Larger models paradoxically scored worse on early versions because they better learned the popular-but-wrong answers from training data. TruthfulQA is a standard safety evaluation measuring a model's tendency toward hallucination and misinformation.",
      "related": [
        "evals",
        "hallucination",
        "red-teaming"
      ],
      "seen_in": [
        "model-cards",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "TruthfulQA: Measuring How Models Mimic Human Falsehoods",
          "authors": "Lin et al.",
          "venue": "ACL 2022",
          "arxiv": "2109.07958"
        }
      ],
      "resources": [
        {
          "label": "TruthfulQA GitHub",
          "url": "https://github.com/sylinrl/TruthfulQA"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "unsloth-dynamic": {
      "id": "unsloth-dynamic",
      "name": "Unsloth Dynamic (UD)",
      "expansion": "Unsloth Dynamic Quantization v2.0",
      "category": "quantization-methods",
      "oneliner": "Proprietary layer-selective quantization that varies precision per layer based on sensitivity analysis. 10-30% KL divergence reduction vs uniform.",
      "explanation": "Unsloth Dynamic is a variable-precision quantization strategy that assigns different bit precisions to different layers based on measured sensitivity. It profiles each layer using a large calibration set of 300K to 1.5M tokens and gives more bits to sensitive layers while compressing robust layers more aggressively. This claims 10 to 30 percent lower KL divergence compared to uniform quantization at the same average bitwidth. It is available for GGUF and MLX formats.",
      "fundamentals": "1) Profile: quantize each layer at multiple precisions, measure KL divergence from full-precision output. 2) Allocate: assign precision per layer to minimize total KL divergence within a size budget. 3) Quantize: apply chosen precision per layer. Similar in spirit to EXL2's approach but with different optimization objective (KL divergence vs reconstruction error).",
      "seen_in": [
        "gguf-filename",
        "repo-name"
      ],
      "related": [
        "exl2",
        "gguf",
        "quantization"
      ],
      "sources": [
        "unsloth.ai/blog/dynamic-v2"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "Unsloth Dynamic v2.0 blog post",
          "url": "https://unsloth.ai/blog/dynamic-v2"
        },
        {
          "label": "Unsloth GitHub",
          "url": "https://github.com/unslothai/unsloth"
        }
      ]
    },
    "vector-database": {
      "id": "vector-database",
      "name": "Vector Database",
      "expansion": "Vector Database (Vector Store)",
      "category": "embeddings-retrieval",
      "oneliner": "A specialized database optimized for storing and querying high-dimensional embedding vectors using approximate nearest neighbor search.",
      "explanation": "A vector database stores embedding vectors and supports fast similarity search over millions or billions of entries using approximate nearest neighbor algorithms. When a user asks a question in a RAG system, the query is embedded into a vector, and the database returns the most semantically similar document chunks. Popular options include Pinecone (managed), Weaviate, Qdrant, Milvus (open-source), and pgvector (PostgreSQL extension).",
      "related": [
        "embeddings",
        "rag",
        "tag-embed"
      ],
      "seen_in": [
        "code",
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "Pinecone — What is a Vector Database?",
          "url": "https://www.pinecone.io/learn/vector-database/"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "vit": {
      "id": "vit",
      "name": "ViT",
      "expansion": "Vision Transformer",
      "category": "layer-types",
      "oneliner": "A transformer that processes images by splitting them into fixed-size patches and treating each patch as a token — the dominant vision encoder in modern VLMs.",
      "explanation": "Vision Transformer applies the standard transformer architecture to images by dividing them into a grid of fixed-size patches (typically 14x14 or 16x16 pixels), flattening each patch into a vector, and projecting it into the model's embedding dimension. These patch embeddings are processed by standard transformer layers with self-attention, just like text tokens. ViT proved that transformers can match or beat CNNs for vision tasks when trained at scale.",
      "related": [
        "vlm",
        "clip",
        "self-attention",
        "backbone"
      ],
      "seen_in": [
        "model-config",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale",
          "authors": "Dosovitskiy et al.",
          "venue": "ICLR 2021",
          "arxiv": "2010.11929"
        }
      ],
      "resources": [
        {
          "label": "ViT paper",
          "url": "https://arxiv.org/abs/2010.11929"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "vlm": {
      "id": "vlm",
      "name": "VLM",
      "expansion": "Vision-Language Model",
      "category": "layer-types",
      "oneliner": "A model combining a vision encoder with an LLM backbone to understand and reason about images alongside text — LLaVA, Qwen-VL, Llama Vision.",
      "explanation": "A vision-language model pairs an image encoder (typically a Vision Transformer like SigLIP or CLIP) with a language model backbone via a projection layer that maps visual features into the LLM's token embedding space. This lets the model understand images as naturally as text — describing scenes, answering questions about visual content, and reasoning over diagrams. Popular VLMs include LLaVA, Qwen2-VL, Llama 3.2 Vision, and InternVL.",
      "related": [
        "tag-vision",
        "backbone",
        "model-head",
        "self-attention"
      ],
      "seen_in": [
        "model-config",
        "model-cards"
      ],
      "foundational_papers": [
        {
          "title": "Visual Instruction Tuning (LLaVA)",
          "authors": "Liu et al.",
          "venue": "NeurIPS 2023",
          "arxiv": "2304.08485"
        }
      ],
      "resources": [
        {
          "label": "ByteByteGo — Multimodal LLMs",
          "url": "https://blog.bytebytego.com/p/multimodal-llms-basics-how-llms-process"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "w4a16": {
      "id": "w4a16",
      "name": "W4A16",
      "expansion": "4-bit Weights, 16-bit Activations",
      "category": "quantization-basics",
      "oneliner": "Weight-only quantization: weights stored in 4-bit, dequantized to fp16 on-the-fly for computation. Saves memory, compute stays fp16.",
      "explanation": "W4A16 is a quantization configuration where model weights are stored in 4-bit precision but activations remain in 16-bit during computation. The weights are decompressed to fp16 on the fly as each layer runs, saving memory and bandwidth while keeping arithmetic at full precision. This is the standard approach for memory-constrained inference on consumer GPUs using tools like llama.cpp. It does not speed up compute for large batches since the math stays in fp16.",
      "fundamentals": "Inference per layer: 1) Read int4 weights from memory. 2) Dequantize: W_fp16 = scale $\\times$ (W_int4 - zero_point). 3) GEMM: Y = X_fp16 @ W_fp16. Memory bandwidth savings: 4$\\times$ vs fp16. Compute savings: none (still fp16 GEMM). For batch=1 (memory-bound): ~4$\\times$ throughput. For large batches (compute-bound): no speedup.",
      "seen_in": [
        "documentation",
        "benchmark-tables"
      ],
      "related": [
        "awq",
        "fp16",
        "gptq",
        "inference",
        "int4",
        "quantization",
        "tool-llamacpp",
        "w8a8"
      ],
      "sources": [
        "Lin et al., 'AWQ,' arXiv:2306.00978"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "AWQ paper (W4A16 context)",
          "url": "https://arxiv.org/abs/2306.00978"
        }
      ]
    },
    "w8a8": {
      "id": "w8a8",
      "name": "W8A8",
      "expansion": "8-bit Weights, 8-bit Activations",
      "category": "quantization-basics",
      "oneliner": "Both weights and activations quantized to 8-bit — saves memory AND compute via integer arithmetic. Standard for data-center serving.",
      "explanation": "W8A8 quantization stores both weights and activations in 8-bit integer format, enabling integer matrix multiplication on GPU tensor cores. Unlike weight-only quantization that just saves memory, W8A8 provides a real compute speedup because both matrix operands are low-precision. The challenge is quantizing activations at runtime, since their distributions shift with each input. SmoothQuant solved this by rebalancing difficulty from activations to weights.",
      "fundamentals": "Inference: 1) Quantize activation: X_int8 = round(X_fp16/s_x). 2) Read int8 weights. 3) INT8 GEMM: Y_int32 = X_int8 @ W_int8. 4) Dequantize: Y_fp16 = Y_int32 $\\times$ s_x $\\times$ s_w. Key difference from W4A16: saves both bandwidth AND compute. For large batch sizes (compute-bound workloads), W8A8 provides real speedup while W4A16 does not.",
      "seen_in": [
        "documentation",
        "serving-config"
      ],
      "related": [
        "inference",
        "int8",
        "quantization",
        "tool-tensorrt-llm",
        "w4a16"
      ],
      "sources": [
        "Xiao et al., 'SmoothQuant,' arXiv:2211.10438"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "resources": [
        {
          "label": "SmoothQuant paper (W8A8 context)",
          "url": "https://arxiv.org/abs/2211.10438"
        }
      ]
    },
    "weight-tying": {
      "id": "weight-tying",
      "name": "Weight Tying",
      "expansion": "Weight Tying (Shared Input/Output Embeddings)",
      "category": "layer-types",
      "oneliner": "Sharing the same weight matrix between the input token embedding and the output prediction layer, saving parameters and improving quality.",
      "explanation": "Weight tying reuses the token embedding matrix as the output projection that converts hidden states back to vocabulary logits. Since both layers map between the vocabulary and hidden dimensions, sharing them saves significant parameters and forces a consistent token representation. For a vocabulary of 128K and hidden dimension of 4096, this saves over 500 million parameters. Many modern LLMs — particularly smaller ones, where the embedding matrix is a large fraction of total parameters — use weight tying; check the config for tie_word_embeddings.",
      "fundamentals": "Without tying: input embedding $W_E \\in \\mathbb{R}^{|V| \\times d}$, output projection $W_O \\in \\mathbb{R}^{d \\times |V|}$. With tying: $W_O = W_E^\\top$, so logits $= h \\cdot W_E^\\top$. Saves $|V| \\times d$ parameters. For Llama-3 8B ($|V|$=128K, $d$=4096): saves 524M params (~6% of model).",
      "related": [
        "model-head",
        "backbone",
        "token"
      ],
      "seen_in": [
        "model-config",
        "code"
      ],
      "foundational_papers": [
        {
          "title": "Using the Output Embedding to Improve Language Models",
          "authors": "Press & Wolf",
          "venue": "EACL 2017",
          "arxiv": "1608.05859"
        }
      ],
      "resources": [
        {
          "label": "Sebastian Raschka — LLM Architecture Gallery",
          "url": "https://sebastianraschka.com/llm-architecture-gallery/"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "yarn": {
      "id": "yarn",
      "name": "YaRN",
      "expansion": "Yet another RoPE extensioN",
      "category": "position-encodings",
      "oneliner": "Improved RoPE context extension that treats different frequency bands separately — untouched, interpolated, or ramped — achieving large context extensions with minimal fine-tuning.",
      "explanation": "YaRN is a method for extending a RoPE-based model's context window well beyond its training length. It improves on simpler scaling approaches by treating different frequency bands separately: high-frequency components encoding local positions are left unchanged, low-frequency components for global positions are interpolated, and a smooth ramp connects the two zones. With only about 400 fine-tuning steps, YaRN achieved 32x context extension on Llama 2.",
      "fundamentals": "Define $r_i$ = L/$\\lambda_i$ (number of rotations dimension $i$ completes over the original context length L). Thresholds $\\alpha$=1, $\\beta$=32. If $r_i$>$\\beta$: no scaling (high-frequency, local detail). If $r_i$<$\\alpha$: full interpolation (frequency ÷s). Between: ramp $\\gamma_i$ = ($r_i$-$\\alpha$)/($\\beta$-$\\alpha$), frequency scale = (1-$\\gamma_i$)/s + $\\gamma_i$. Temperature: attention logits *= $\\sqrt{1/t}$ where t $\\approx$ 0.1·ln(s)+1. Three-part approach preserves local precision while extending global range — key improvement over uniform NTK. Extension: LLaMA 2 7B/13B to 64K-128K, passkey retrieval passing at 128K.",
      "seen_in": [
        "model-config"
      ],
      "related": [
        "rope",
        "ntk-rope",
        "position-interpolation",
        "abf"
      ],
      "sources": [
        "Peng et al., 'YaRN: Efficient Context Window Extension of Large Language Models,' 2023, arXiv:2309.00071"
      ],
      "confidence": "high",
      "verified_date": "2026-04-10",
      "foundational_papers": [
        {
          "title": "YaRN: Efficient Context Window Extension of Large Language Models",
          "authors": "Peng et al.",
          "venue": "2023",
          "arxiv": "2309.00071"
        }
      ]
    },
    "zero-shot": {
      "id": "zero-shot",
      "name": "Zero-Shot Prompting",
      "expansion": "Zero-Shot Prompting",
      "category": "prompting",
      "oneliner": "Querying a model with only task instructions and no examples, relying entirely on the model's pre-trained knowledge to understand what is being asked.",
      "explanation": "Zero-shot prompting asks a model to perform a task using only a natural language instruction with no examples. The model must understand the task from the description alone, drawing on patterns learned during pre-training and instruction tuning. For instance, asking 'Translate the following sentence to French' without showing any translation examples. Zero-shot performance improves dramatically with model scale and instruction tuning quality.",
      "related": [
        "few-shot",
        "prompt-engineering",
        "sft"
      ],
      "seen_in": [
        "documentation"
      ],
      "foundational_papers": [],
      "resources": [
        {
          "label": "Lilian Weng — Prompt Engineering",
          "url": "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    },
    "zero": {
      "id": "zero",
      "name": "ZeRO",
      "expansion": "Zero Redundancy Optimizer",
      "category": "scaling-patterns",
      "oneliner": "DeepSpeed's memory optimization that shards optimizer states, gradients, and parameters across GPUs to eliminate redundant copies.",
      "explanation": "ZeRO is a family of memory optimizations from DeepSpeed that eliminates redundant copies of model state across GPUs during distributed training. In standard data parallelism, every GPU stores a full copy of weights, gradients, and optimizer states. ZeRO shards these across GPUs in three stages: stage one shards optimizer states, stage two adds gradients, and stage three shards everything including parameters. This enabled training 100B-plus models on commodity GPU clusters.",
      "fundamentals": "With $N$ GPUs and mixed-precision Adam, per-GPU memory under standard DP is $16P$ bytes (2P params + 2P grads + 12P optimizer). ZeRO-1: $4P + 12P/N$. ZeRO-2: $2P + 14P/N$. ZeRO-3: $16P/N$. Communication: ZeRO-1/2 same as allreduce ($2P$ bytes/step). ZeRO-3 adds all-gather ($+P$ bytes), total $3P$ — 1.5x increase.",
      "related": [
        "tp-pp",
        "full-ft"
      ],
      "seen_in": [
        "model-config",
        "code",
        "documentation"
      ],
      "foundational_papers": [
        {
          "title": "ZeRO: Memory Optimizations Toward Training Trillion Parameter Models",
          "authors": "Rajbhandari et al.",
          "venue": "SC 2020",
          "arxiv": "1910.02054"
        }
      ],
      "resources": [
        {
          "label": "DeepSpeed ZeRO documentation",
          "url": "https://www.deepspeed.ai/tutorials/zero/"
        },
        {
          "label": "Lilian Weng — How to Train Really Large Models",
          "url": "https://lilianweng.github.io/posts/2021-09-25-train-large/"
        }
      ],
      "confidence": "high",
      "verified_date": "2026-04-11"
    }
  }
}