
Large Language Models (LLMs)

LLM Development Timeline

2017: "Attention Is All You Need" - the Transformer paper is published
2018: GPT-1 (117M) - the start of the GPT line
2018: BERT (340M) - bidirectional pre-training
2019: GPT-2 (1.5B) - zero-shot abilities
2020: GPT-3 (175B) - few-shot learning, emergent abilities
2021: Codex
2022: PaLM (540B), ChatGPT, InstructGPT, GPT-3.5
2023: GPT-4, Claude, LLaMA, Bard
2024: GPT-4o, Claude 3, Gemini 1.5

The GPT Series in Detail

GPT Architecture

python
import torch
import torch.nn as nn
import torch.nn.functional as F

class GPTModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embed = nn.Embedding(config.vocab_size, config.hidden_size)
        self.pos_embed = nn.Parameter(torch.zeros(1, config.max_seq_len, config.hidden_size))
        self.drop = nn.Dropout(config.dropout)
        
        self.blocks = nn.ModuleList([
            TransformerBlock(config) for _ in range(config.num_layers)
        ])
        
        self.norm = nn.LayerNorm(config.hidden_size)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        
        # weight tying: share the token-embedding matrix with the LM head
        self.lm_head.weight = self.embed.weight
    
    def forward(self, x):
        # x: (batch, seq_len)
        x = self.embed(x) + self.pos_embed[:, :x.size(1)]
        x = self.drop(x)
        
        for block in self.blocks:
            x = block(x)
        
        x = self.norm(x)
        return self.lm_head(x)  # (batch, seq_len, vocab_size)
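
A quick smoke test of the model above (a sketch: `config` is a hypothetical minimal config object, and `TransformerBlock` / `CausalSelfAttention` are defined below):

python
import torch
from types import SimpleNamespace

config = SimpleNamespace(vocab_size=50257, hidden_size=768, max_seq_len=1024,
                         dropout=0.1, num_layers=12, num_heads=12,
                         intermediate_size=3072)
model = GPTModel(config)
tokens = torch.randint(0, config.vocab_size, (2, 16))  # (batch, seq_len)
logits = model(tokens)                                 # (2, 16, 50257)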

Transformer Block

python
class TransformerBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config)  # 因果注意力
        self.ln1 = nn.LayerNorm(config.hidden_size)
        self.mlp = nn.Sequential(
            nn.Linear(config.hidden_size, config.intermediate_size),
            nn.GELU(),
            nn.Linear(config.intermediate_size, config.hidden_size),
        )
        self.ln2 = nn.LayerNorm(config.hidden_size)
    
    def forward(self, x):
        # Pre-norm: x + attn(norm(x)), i.e. normalize before each sublayer
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

Causal Self-Attention (Causal Mask)

python
class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_heads = config.num_heads
        self.q_proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.k_proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.v_proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.out_proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.attn_drop = nn.Dropout(config.dropout)
    
    def forward(self, x):
        B, T, C = x.shape
        
        # Q, K, V projections
        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)
        
        # split into heads: (B, T, C) -> (B, num_heads, T, head_dim)
        head_dim = C // self.num_heads
        q = q.view(B, T, self.num_heads, head_dim).transpose(1, 2)
        k = k.view(B, T, self.num_heads, head_dim).transpose(1, 2)
        v = v.view(B, T, self.num_heads, head_dim).transpose(1, 2)
        
        # scaled attention scores
        scale = head_dim ** -0.5
        scores = torch.matmul(q, k.transpose(-2, -1)) * scale
        
        # causal mask: block attention to future positions
        causal_mask = torch.triu(torch.ones(T, T, device=x.device), diagonal=1).bool()
        scores.masked_fill_(causal_mask, float('-inf'))
        
        # Softmax
        attn = F.softmax(scores, dim=-1)
        attn = self.attn_drop(attn)
        
        # merge heads and project the output
        out = torch.matmul(attn, v)
        out = out.transpose(1, 2).contiguous().view(B, T, C)
        
        return self.out_proj(out)

Attention 变体

Flash Attention

python
# Standard attention: O(N²) memory
# FlashAttention: O(N) memory via tiled, block-wise computation

# using the FlashAttention-2 library
from flash_attn import flash_attn_func

def forward(self, q, k, v):
    # q, k, v: (B, T, H, D)
    return flash_attn_func(q, k, v, causal=True)
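
PyTorch 2.x exposes a fused attention kernel through `F.scaled_dot_product_attention`, which can dispatch to FlashAttention-style implementations when available; a minimal sketch (note it expects a `(B, H, T, D)` layout, unlike `flash_attn_func`):

python
import torch
import torch.nn.functional as F

B, H, T, D = 2, 8, 128, 64
q = torch.randn(B, H, T, D)
k = torch.randn(B, H, T, D)
v = torch.randn(B, H, T, D)

# fused causal attention; picks a flash / memory-efficient kernel if supported
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)  # (B, H, T, D)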

Multi-Query Attention (MQA)

python
# Standard multi-head attention: every head has its own K, V
# MQA: all heads share a single K, V head

# Fewer K/V projection parameters
# Faster inference (smaller KV cache)

Grouped-Query Attention (GQA)

python
# MQA: 1 K/V group
# GQA: N K/V groups (1 < N < num_heads)
# A middle ground between MHA and MQA (see the sketch below)

# Llama 2 (70B) uses GQA
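
A minimal sketch of the GQA idea, assuming `num_kv_heads` evenly divides `num_heads`: project K/V with fewer heads, then repeat each K/V head across its query group (MQA is the special case `num_kv_heads = 1`):

python
import torch

def expand_kv(kv, num_heads):
    # kv: (B, num_kv_heads, T, D) -> (B, num_heads, T, D)
    B, num_kv_heads, T, D = kv.shape
    group = num_heads // num_kv_heads  # query heads per K/V head
    return kv.repeat_interleave(group, dim=1)

# e.g. 8 query heads sharing 2 K/V heads
k = torch.randn(2, 2, 128, 64)
k_expanded = expand_kv(k, num_heads=8)  # (2, 8, 128, 64)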

GPT-3 Scaling Techniques

Sparse Attention

python
# Local window attention: each token attends only to nearby tokens
# + global attention: a few special tokens attend to all positions

class SparseAttention(nn.Module):
    # schematic: local_attn, global_attn and combine are placeholders
    def forward(self, x, global_tokens):
        # local window
        local_attn = self.local_attn(x)
        
        # global tokens
        global_attn = self.global_attn(x, global_tokens)
        
        return combine(local_attn, global_attn)
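
The local-window half can be made concrete with a banded causal mask; a sketch, assuming a causal window of size `window`:

python
import torch

def local_causal_mask(T, window, device=None):
    # True = masked out; token i may attend to j only if 0 <= i - j < window
    i = torch.arange(T, device=device)
    dist = i[:, None] - i[None, :]
    return (dist < 0) | (dist >= window)

# use like the causal mask above: scores.masked_fill_(mask, float('-inf'))
mask = local_causal_mask(T=6, window=3)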

Positional Encoding: RoPE

python
# Rotary Position Embedding (used by LLaMA)
# Rotates Q/K by position-dependent angles instead of adding absolute position embeddings

def rotate_half(x):
    x1, x2 = x[..., :x.shape[-1]//2], x[..., x.shape[-1]//2:]
    return torch.cat([-x2, x1], dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin):
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
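
The `cos` / `sin` tables above come from per-position rotation angles; a sketch of the usual construction (following the common `base = 10000` convention):

python
import torch

def build_rope_cache(seq_len, head_dim, base=10000.0, device=None):
    # one rotation frequency per pair of dimensions
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device=device).float() / head_dim))
    t = torch.arange(seq_len, device=device).float()
    freqs = torch.outer(t, inv_freq)         # (seq_len, head_dim / 2)
    emb = torch.cat([freqs, freqs], dim=-1)  # (seq_len, head_dim)
    return emb.cos(), emb.sin()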

RLHF (Reinforcement Learning from Human Feedback)

Three-Stage Training

1. SFT (Supervised Fine-Tuning)
   - high-quality, human-written dialogue data
   - supervised fine-tuning
   
2. Reward Model
   - train a reward model to predict human preferences
   - input: (prompt, response)
   - output: a scalar score
   
3. PPO (Proximal Policy Optimization)
   - optimize against the reward model with reinforcement learning
   - keep the policy close to the SFT model (KL-divergence penalty)
python
# SFT
dataset = load_sft_data()  # high-quality human-written dialogues
model = load_pretrained_model()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
model.train()
for batch in dataloader:  # a DataLoader built from `dataset`
    outputs = model(**batch)
    # flatten (batch, seq_len, vocab) logits for token-level cross-entropy
    loss = F.cross_entropy(outputs.logits.view(-1, outputs.logits.size(-1)),
                           batch['labels'].view(-1))
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

# Reward Model
class RewardModel(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.base = base_model
        self.reward_head = nn.Linear(base_model.config.hidden_size, 1)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.base(input_ids, attention_mask)
        # score from the last token's hidden state ([CLS] for encoder models)
        reward = self.reward_head(outputs.last_hidden_state[:, -1])
        return reward

# PPO (schematic)
class PPOTrainer:
    def __init__(self, policy_model, ref_model, reward_model):
        self.policy = policy_model
        self.ref = ref_model
        self.reward = reward_model
    
    def compute_kl(self, logprobs, ref_logprobs):
        # KL(policy || ref), estimated on sampled responses
        return (logprobs - ref_logprobs).mean()
    
    def step(self, logprobs, old_logprobs, ref_logprobs, rewards,
             eps=0.2, kl_coef=0.1):
        # advantages: standardized rewards (a full impl would use GAE)
        advantages = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
        
        # clipped PPO surrogate objective
        ratio = torch.exp(logprobs - old_logprobs)
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1 - eps, 1 + eps) * advantages
        policy_loss = -torch.min(surr1, surr2).mean()
        
        # KL penalty keeps the policy close to the reference (SFT) model
        kl_loss = self.compute_kl(logprobs, ref_logprobs)
        
        return policy_loss + kl_coef * kl_loss
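
The reward model is typically trained on human preference pairs with a pairwise ranking (Bradley-Terry) loss; a minimal sketch, assuming batches of preferred (`chosen_*`) and dispreferred (`rejected_*`) responses:

python
import torch.nn.functional as F

def reward_loss(reward_model, chosen_ids, chosen_mask, rejected_ids, rejected_mask):
    r_chosen = reward_model(chosen_ids, chosen_mask)        # (B, 1)
    r_rejected = reward_model(rejected_ids, rejected_mask)  # (B, 1)
    # push the chosen score above the rejected one
    return -F.logsigmoid(r_chosen - r_rejected).mean()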

Model Scale and Emergent Abilities

Model | Parameters | Emergent abilities
GPT-2 | 1.5B | -
GPT-3 | 175B | text completion, simple reasoning
GPT-4 | ~1T (estimated) | complex reasoning, code generation, multimodality

Examples of Emergent Abilities

- Chain-of-Thought reasoning
- zero-shot generalization
- code generation
- In-Context Learning
- multi-step reasoning

Prompt Engineering

python
# 1. Zero-shot
prompt = "Translate to French: Hello, how are you?"
response = model.generate(prompt)

# 2. Few-shot (In-Context Learning)
prompt = """Translate to French:
English: Hello
French: Bonjour
English: How are you?
French:"""
response = model.generate(prompt)

# 3. Chain-of-Thought
prompt = """Q: John has 5 apples. He gives 2 to Mary. How many does he have?
A: John started with 5 apples. He gave away 2. 5 - 2 = 3. The answer is 3.

Q: If a train travels 120 miles in 2 hours, what is its speed?"""
response = model.generate(prompt)

# 4. Constitutional AI
prompt = """You are a helpful assistant.
Based on the following principles, critique and revise your response:
1. Be helpful and informative
2. Be honest about limitations
3. Avoid harmful content
...
"""

LLM Inference Optimization

KV Cache

python
# Standard attention: recompute K, V for all previous tokens at every step
# KV cache: store and reuse the K, V of earlier tokens

class ModelWithKVCache:
    # the cache is passed in and returned explicitly at each decoding step
    def forward(self, x, kv_cache=None):
        q = self.q_proj(x)
        k_new = self.k_proj(x)
        v_new = self.v_proj(x)
        
        if kv_cache is not None:
            # append along the time dimension; shapes are (B, T, C)
            k = torch.cat([kv_cache['k'], k_new], dim=1)
            v = torch.cat([kv_cache['v'], v_new], dim=1)
        else:
            k, v = k_new, v_new
        
        out = self.attn(q, k, v)
        return out, {'k': k, 'v': v}
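
Usage during autoregressive decoding, a sketch (assuming the model wraps the layer above and returns logits plus the updated cache, and that `prompt_ids` / `max_new_tokens` are given): fill the cache once with the full prompt, then feed one token at a time:

python
logits, cache = model.forward(prompt_ids)  # prefill: process the whole prompt
next_token = logits[:, -1].argmax(dim=-1, keepdim=True)

for _ in range(max_new_tokens):
    # each step attends over the cached K, V plus one new token
    logits, cache = model.forward(next_token, kv_cache=cache)
    next_token = logits[:, -1].argmax(dim=-1, keepdim=True)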

Quantization

python
# FP16/BF16 → INT8/INT4

# Post-Training Quantization (PTQ)
# - dynamic quantization: int8 weights, converted on the fly at inference
# - static quantization: offline calibration, quantization parameters saved
# - QAT (Quantization-Aware Training): simulate quantization during training

# GPTQ / AWQ / GGUF formats

# bitsandbytes example
import torch
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)
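
Loading a model with this config through `transformers` (a sketch; the model name is a placeholder):

python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",               # placeholder model name
    quantization_config=quantization_config,  # the BitsAndBytesConfig above
    device_map="auto",                        # place layers across available devices
)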

Inference Frameworks

python
# vLLM: PagedAttention, continuous batching
# TensorRT-LLM: NVIDIA-optimized
# llama.cpp: CPU/GPU inference, GGUF format
# Ollama: simplified deployment
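
A minimal vLLM usage sketch (the model name is a placeholder):

python
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-2-7b-hf")  # placeholder model name
params = SamplingParams(temperature=0.8, max_tokens=128)
outputs = llm.generate(["What is attention?"], params)
print(outputs[0].outputs[0].text)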

Popular Open-Source LLMs

Model | Parameters | Notes
LLaMA 2 | 7B-70B | Meta, open base models
Code LLaMA | 7B-34B | code-specialized
Mistral | 7B | efficient dense model
Mixtral | 8×7B | MoE (8 experts)
Vicuna | 7B-33B | fine-tuned on ChatGPT-style conversations
Falcon | 7B-180B | TII (Technology Innovation Institute)
Qwen | 7B-72B | Alibaba, strong Chinese support
ChatGLM | 6B-130B | Tsinghua, Chinese-focused
Yi | 6B-34B | 01.AI (零一万物)
DeepSeek | 7B-67B | strong Chinese open-source line

Fine-Tuning Methods

python
# 1. Full fine-tuning
# - update all parameters
# - needs a lot of GPU memory
# - best quality, highest cost

# 2. LoRA (Low-Rank Adaptation)
# - train only a pair of low-rank matrices per layer
# - drastically fewer trainable parameters

class LoRALinear(nn.Module):
    def __init__(self, linear, rank=4, alpha=1):
        super().__init__()
        self.linear = linear
        self.linear.weight.requires_grad_(False)  # freeze the base weight
        # A starts random, B starts at zero, so the initial delta is zero
        self.lora_A = nn.Parameter(torch.randn(rank, linear.in_features) * 0.01)
        self.lora_B = nn.Parameter(torch.zeros(linear.out_features, rank))
        self.scaling = alpha / rank
    
    def forward(self, x):
        return self.linear(x) + (x @ self.lora_A.T @ self.lora_B.T) * self.scaling

# usage: wrap the attention projections; note that reassigning the loop
# variable would not modify the model, so replace via setattr on the parent
model = load_pretrained_model()
targets = [(n, m) for n, m in model.named_modules()
           if isinstance(m, nn.Linear) and any(k in n for k in ('q_proj', 'v_proj'))]
for name, module in targets:
    parent_name, _, child_name = name.rpartition('.')
    setattr(model.get_submodule(parent_name), child_name, LoRALinear(module, rank=8))

# 3. QLoRA (Quantized LoRA)
# - 4-bit quantized base model + LoRA adapters
# - a 65B model can be fine-tuned on a single 48GB GPU

# 4. Adapter
# - insert small adapter modules between Transformer sublayers
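
In practice LoRA is usually applied through the `peft` library instead of hand-rolled wrappers; a sketch (target module names depend on the architecture):

python
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # architecture-dependent
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the LoRA A/B matrices are trainable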

RAG (Retrieval-Augmented Generation)

python
class RAGSystem:
    def __init__(self, llm, embedder, vector_db):
        self.llm = llm
        self.embedder = embedder
        self.vector_db = vector_db
    
    def generate(self, query, top_k=5):
        # 1. embed the query
        query_vec = self.embedder.encode(query)
        
        # 2. retrieve the most relevant documents
        docs = self.vector_db.search(query_vec, top_k)
        
        # 3. build the prompt
        context = "\n".join([doc.content for doc in docs])
        prompt = f"""Based on the following context, answer the question.

Context:
{context}

Question: {query}

Answer:"""
        
        # 4. generate with the LLM
        return self.llm.generate(prompt)

Agent Systems

python
class LLMAgent:
    def __init__(self, llm, tools):
        self.llm = llm
        self.tools = tools  # dict mapping tool names to callables
    
    def plan(self, task, history):
        # decide the next action given the task and the steps taken so far
        prompt = f"""Task: {task}
Available tools: {list(self.tools.keys())}
History: {history}

What should I do next?"""
        
        response = self.llm.generate(prompt)
        
        # parse the tool call out of the response (see the sketch below)
        tool_name, args = parse_tool_call(response)
        
        return tool_name, args
    
    def execute(self, task, max_steps=10):
        history = []
        for _ in range(max_steps):
            tool, args = self.plan(task, history)
            
            if tool is None:
                break  # no tool call: the task is done
            
            result = self.tools[tool](**args)
            history.append((tool, args, result))
        
        return self.summarize(task, history)
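
`parse_tool_call` above is an assumed helper; one hypothetical implementation, if the LLM is prompted to answer in a `TOOL: name {json args}` format:

python
import json
import re

def parse_tool_call(response):
    # hypothetical convention: the LLM replies e.g.  TOOL: search {"query": "..."}
    m = re.search(r'TOOL:\s*(\w+)\s*(\{.*\})', response, re.DOTALL)
    if m is None:
        return None, None  # no tool call: treat the task as finished
    return m.group(1), json.loads(m.group(2))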

Interview Key Points

1. Transformer vs RNN
   - parallel computation (faster training)
   - better long-range dependencies
   - global attention

2. GPT vs BERT
   - GPT: unidirectional (causal), generative
   - BERT: bidirectional, understanding-oriented, MLM pre-training

3. Attention complexity
   - O(N²·d) time
   - O(N²) memory for the attention matrix (Flash Attention reduces it; the KV cache avoids recomputation at decode time)

4. Training stability
   - weight initialization
   - mixed precision
   - learning-rate scheduling (warmup + cosine decay)

5. LLM inference optimization
   - KV cache
   - quantization (INT8/INT4)
   - Flash Attention
   - continuous batching

6. RLHF pipeline
   - SFT → Reward Model → PPO

7. How LoRA works
   - low-rank decomposition
   - only the A, B matrices are updated
   - the original weights stay frozen

8. Emergent abilities
   - appear abruptly once model scale crosses a threshold
   - chain-of-thought, zero-shot generalization, etc.

Released under the MIT License.